regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 EXTERN_C const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_static.c"
  90 #include "charclass_invlists.h"
  91 #include "inline_invlist.c"
  92 #include "unicode_constants.h"
  93
  94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  95 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  96 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  97
  98 #ifndef STATIC
  99 #define STATIC  static
 100 #endif
 101
 102
 103 struct RExC_state_t {
 104     U32         flags;                  /* RXf_* are we folding, multilining? */
 105     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 106     char        *precomp;               /* uncompiled string. */
 107     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 108     regexp      *rx;                    /* perl core regexp structure */
 109     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 110     char        *start;                 /* Start of input for compile */
 111     char        *end;                   /* End of input for compile */
 112     char        *parse;                 /* Input-scan pointer. */
 113     SSize_t     whilem_seen;            /* number of WHILEM in this expr */
 114     regnode     *emit_start;            /* Start of emitted-code area */
 115     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 116     regnode     *emit;                  /* Code-emit pointer; if = &emit_dummy,
 117                                            implies compiling, so don't emit */
 118     regnode_ssc emit_dummy;             /* placeholder for emit to point to;
 119                                            large enough for the largest
 120                                            non-EXACTish node, so can use it as
 121                                            scratch in pass1 */
 122     I32         naughty;                /* How bad is this pattern? */
 123     I32         sawback;                /* Did we see \1, ...? */
 124     U32         seen;
 125     SSize_t     size;                   /* Code size. */
 126     I32                npar;                        /* Capture buffer count, (OPEN) plus one. ("par" 0 is the whole pattern)*/
 127     I32         nestroot;               /* root parens we are in - used by accept */
 128     I32         extralen;
 129     I32         seen_zerolen;
 130     regnode     **open_parens;          /* pointers to open parens */
 131     regnode     **close_parens;         /* pointers to close parens */
 132     regnode     *opend;                 /* END node in program */
 133     I32         utf8;           /* whether the pattern is utf8 or not */
 134     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 135                                 /* XXX use this for future optimisation of case
 136                                  * where pattern must be upgraded to utf8. */
 137     I32         uni_semantics;  /* If a d charset modifier should use unicode
 138                                    rules, even if the pattern is not in
 139                                    utf8 */
 140     HV          *paren_names;           /* Paren names */
 141
 142     regnode     **recurse;              /* Recurse regops */
 143     I32         recurse_count;          /* Number of recurse regops */
 144     U8          *study_chunk_recursed;  /* bitmap of which parens we have moved through */
 145     U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
 146     I32         in_lookbehind;
 147     I32         contains_locale;
 148     I32         contains_i;
 149     I32         override_recoding;
 150     I32         in_multi_char_class;
 151     struct reg_code_block *code_blocks; /* positions of literal (?{})
 152                                             within pattern */
 153     int         num_code_blocks;        /* size of code_blocks[] */
 154     int         code_index;             /* next code_blocks[] slot */
 155 #if ADD_TO_REGEXEC
 156     char        *starttry;              /* -Dr: where regtry was called. */
 157 #define RExC_starttry   (pRExC_state->starttry)
 158 #endif
 159     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 160 #ifdef DEBUGGING
 161     const char  *lastparse;
 162     I32         lastnum;
 163     AV          *paren_name_list;       /* idx -> name */
 164 #define RExC_lastparse  (pRExC_state->lastparse)
 165 #define RExC_lastnum    (pRExC_state->lastnum)
 166 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 167 #endif
 168 };
 169
 170 #define RExC_flags      (pRExC_state->flags)
 171 #define RExC_pm_flags   (pRExC_state->pm_flags)
 172 #define RExC_precomp    (pRExC_state->precomp)
 173 #define RExC_rx_sv      (pRExC_state->rx_sv)
 174 #define RExC_rx         (pRExC_state->rx)
 175 #define RExC_rxi        (pRExC_state->rxi)
 176 #define RExC_start      (pRExC_state->start)
 177 #define RExC_end        (pRExC_state->end)
 178 #define RExC_parse      (pRExC_state->parse)
 179 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 180 #ifdef RE_TRACK_PATTERN_OFFSETS
 181 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 182 #endif
 183 #define RExC_emit       (pRExC_state->emit)
 184 #define RExC_emit_dummy (pRExC_state->emit_dummy)
 185 #define RExC_emit_start (pRExC_state->emit_start)
 186 #define RExC_emit_bound (pRExC_state->emit_bound)
 187 #define RExC_naughty    (pRExC_state->naughty)
 188 #define RExC_sawback    (pRExC_state->sawback)
 189 #define RExC_seen       (pRExC_state->seen)
 190 #define RExC_size       (pRExC_state->size)
 191 #define RExC_npar       (pRExC_state->npar)
 192 #define RExC_nestroot   (pRExC_state->nestroot)
 193 #define RExC_extralen   (pRExC_state->extralen)
 194 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 195 #define RExC_utf8       (pRExC_state->utf8)
 196 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 197 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 198 #define RExC_open_parens        (pRExC_state->open_parens)
 199 #define RExC_close_parens       (pRExC_state->close_parens)
 200 #define RExC_opend      (pRExC_state->opend)
 201 #define RExC_paren_names        (pRExC_state->paren_names)
 202 #define RExC_recurse    (pRExC_state->recurse)
 203 #define RExC_recurse_count      (pRExC_state->recurse_count)
 204 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
 205 #define RExC_study_chunk_recursed_bytes        (pRExC_state->study_chunk_recursed_bytes)
 206 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 207 #define RExC_contains_locale    (pRExC_state->contains_locale)
 208 #define RExC_contains_i (pRExC_state->contains_i)
 209 #define RExC_override_recoding (pRExC_state->override_recoding)
 210 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 211
 212
 213 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 214 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 215         ((*s) == '{' && regcurly(s, FALSE)))
 216
 217 /*
 218  * Flags to be passed up and down.
 219  */
 220 #define WORST           0       /* Worst case. */
 221 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 222
 223 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 224  * character.  (There needs to be a case: in the switch statement in regexec.c
 225  * for any node marked SIMPLE.)  Note that this is not the same thing as
 226  * REGNODE_SIMPLE */
 227 #define SIMPLE          0x02
 228 #define SPSTART         0x04    /* Starts with * or + */
 229 #define POSTPONED       0x08    /* (?1),(?&name), (??{...}) or similar */
 230 #define TRYAGAIN        0x10    /* Weeded out a declaration. */
 231 #define RESTART_UTF8    0x20    /* Restart, need to calcuate sizes as UTF-8 */
 232
 233 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 234
 235 /* whether trie related optimizations are enabled */
 236 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 237 #define TRIE_STUDY_OPT
 238 #define FULL_TRIE_STUDY
 239 #define TRIE_STCLASS
 240 #endif
 241
 242
 243
 244 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 245 #define PBITVAL(paren) (1 << ((paren) & 7))
 246 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 247 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 248 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 249
 250 #define REQUIRE_UTF8    STMT_START {                                       \
 251                                      if (!UTF) {                           \
 252                                          *flagp = RESTART_UTF8;            \
 253                                          return NULL;                      \
 254                                      }                                     \
 255                         } STMT_END
 256
 257 /* This converts the named class defined in regcomp.h to its equivalent class
 258  * number defined in handy.h. */
 259 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 260 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 261
 262 #define _invlist_union_complement_2nd(a, b, output) \
 263                         _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
 264 #define _invlist_intersection_complement_2nd(a, b, output) \
 265                  _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
 266
 267 /* About scan_data_t.
 268
 269   During optimisation we recurse through the regexp program performing
 270   various inplace (keyhole style) optimisations. In addition study_chunk
 271   and scan_commit populate this data structure with information about
 272   what strings MUST appear in the pattern. We look for the longest
 273   string that must appear at a fixed location, and we look for the
 274   longest string that may appear at a floating location. So for instance
 275   in the pattern:
 276
 277     /FOO[xX]A.*B[xX]BAR/
 278
 279   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 280   strings (because they follow a .* construct). study_chunk will identify
 281   both FOO and BAR as being the longest fixed and floating strings respectively.
 282
 283   The strings can be composites, for instance
 284
 285      /(f)(o)(o)/
 286
 287   will result in a composite fixed substring 'foo'.
 288
 289   For each string some basic information is maintained:
 290
 291   - offset or min_offset
 292     This is the position the string must appear at, or not before.
 293     It also implicitly (when combined with minlenp) tells us how many
 294     characters must match before the string we are searching for.
 295     Likewise when combined with minlenp and the length of the string it
 296     tells us how many characters must appear after the string we have
 297     found.
 298
 299   - max_offset
 300     Only used for floating strings. This is the rightmost point that
 301     the string can appear at. If set to SSize_t_MAX it indicates that the
 302     string can occur infinitely far to the right.
 303
 304   - minlenp
 305     A pointer to the minimum number of characters of the pattern that the
 306     string was found inside. This is important as in the case of positive
 307     lookahead or positive lookbehind we can have multiple patterns
 308     involved. Consider
 309
 310     /(?=FOO).*F/
 311
 312     The minimum length of the pattern overall is 3, the minimum length
 313     of the lookahead part is 3, but the minimum length of the part that
 314     will actually match is 1. So 'FOO's minimum length is 3, but the
 315     minimum length for the F is 1. This is important as the minimum length
 316     is used to determine offsets in front of and behind the string being
 317     looked for.  Since strings can be composites this is the length of the
 318     pattern at the time it was committed with a scan_commit. Note that
 319     the length is calculated by study_chunk, so that the minimum lengths
 320     are not known until the full pattern has been compiled, thus the
 321     pointer to the value.
 322
 323   - lookbehind
 324
 325     In the case of lookbehind the string being searched for can be
 326     offset past the start point of the final matching string.
 327     If this value was just blithely removed from the min_offset it would
 328     invalidate some of the calculations for how many chars must match
 329     before or after (as they are derived from min_offset and minlen and
 330     the length of the string being searched for).
 331     When the final pattern is compiled and the data is moved from the
 332     scan_data_t structure into the regexp structure the information
 333     about lookbehind is factored in, with the information that would
 334     have been lost precalculated in the end_shift field for the
 335     associated string.
 336
 337   The fields pos_min and pos_delta are used to store the minimum offset
 338   and the delta to the maximum offset at the current point in the pattern.
 339
 340 */
 341
 342 typedef struct scan_data_t {
 343     /*I32 len_min;      unused */
 344     /*I32 len_delta;    unused */
 345     SSize_t pos_min;
 346     SSize_t pos_delta;
 347     SV *last_found;
 348     SSize_t last_end;       /* min value, <0 unless valid. */
 349     SSize_t last_start_min;
 350     SSize_t last_start_max;
 351     SV **longest;           /* Either &l_fixed, or &l_float. */
 352     SV *longest_fixed;      /* longest fixed string found in pattern */
 353     SSize_t offset_fixed;   /* offset where it starts */
 354     SSize_t *minlen_fixed;  /* pointer to the minlen relevant to the string */
 355     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 356     SV *longest_float;      /* longest floating string found in pattern */
 357     SSize_t offset_float_min; /* earliest point in string it can appear */
 358     SSize_t offset_float_max; /* latest point in string it can appear */
 359     SSize_t *minlen_float;  /* pointer to the minlen relevant to the string */
 360     SSize_t lookbehind_float; /* is the pos of the string modified by LB */
 361     I32 flags;
 362     I32 whilem_c;
 363     SSize_t *last_closep;
 364     regnode_ssc *start_class;
 365 } scan_data_t;
 366
 367 /* The below is perhaps overboard, but this allows us to save a test at the
 368  * expense of a mask.  This is because on both EBCDIC and ASCII machines, 'A'
 369  * and 'a' differ by a single bit; the same with the upper and lower case of
 370  * all other ASCII-range alphabetics.  On ASCII platforms, they are 32 apart;
 371  * on EBCDIC, they are 64.  This uses an exclusive 'or' to find that bit and
 372  * then inverts it to form a mask, with just a single 0, in the bit position
 373  * where the upper- and lowercase differ.  XXX There are about 40 other
 374  * instances in the Perl core where this micro-optimization could be used.
 375  * Should decide if maintenance cost is worse, before changing those
 376  *
 377  * Returns a boolean as to whether or not 'v' is either a lowercase or
 378  * uppercase instance of 'c', where 'c' is in [A-Za-z].  If 'c' is a
 379  * compile-time constant, the generated code is better than some optimizing
 380  * compilers figure out, amounting to a mask and test.  The results are
 381  * meaningless if 'c' is not one of [A-Za-z] */
 382 #define isARG2_lower_or_UPPER_ARG1(c, v) \
 383                               (((v) & ~('A' ^ 'a')) ==  ((c) & ~('A' ^ 'a')))
 384
 385 /*
 386  * Forward declarations for pregcomp()'s friends.
 387  */
 388
 389 static const scan_data_t zero_scan_data =
 390   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 391
 392 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 393 #define SF_BEFORE_SEOL          0x0001
 394 #define SF_BEFORE_MEOL          0x0002
 395 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 396 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 397
 398 #define SF_FIX_SHIFT_EOL        (+2)
 399 #define SF_FL_SHIFT_EOL         (+4)
 400
 401 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 402 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 403
 404 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 405 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 406 #define SF_IS_INF               0x0040
 407 #define SF_HAS_PAR              0x0080
 408 #define SF_IN_PAR               0x0100
 409 #define SF_HAS_EVAL             0x0200
 410 #define SCF_DO_SUBSTR           0x0400
 411 #define SCF_DO_STCLASS_AND      0x0800
 412 #define SCF_DO_STCLASS_OR       0x1000
 413 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 414 #define SCF_WHILEM_VISITED_POS  0x2000
 415
 416 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 417 #define SCF_SEEN_ACCEPT         0x8000
 418 #define SCF_TRIE_DOING_RESTUDY 0x10000
 419
 420 #define UTF cBOOL(RExC_utf8)
 421
 422 /* The enums for all these are ordered so things work out correctly */
 423 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 424 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 425 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 426 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 427 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 428 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
 429 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 430
 431 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 432
 433 #define OOB_NAMEDCLASS          -1
 434
 435 /* There is no code point that is out-of-bounds, so this is problematic.  But
 436  * its only current use is to initialize a variable that is always set before
 437  * looked at. */
 438 #define OOB_UNICODE             0xDEADBEEF
 439
 440 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 441 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 442
 443
 444 /* length of regex to show in messages that don't mark a position within */
 445 #define RegexLengthToShowInErrorMessages 127
 446
 447 /*
 448  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 449  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 450  * op/pragma/warn/regcomp.
 451  */
 452 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 453 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 454
 455 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%"UTF8f MARKER2 "%"UTF8f"/"
 456
 457 #define REPORT_LOCATION_ARGS(offset)            \
 458                 UTF8fARG(UTF, offset, RExC_precomp), \
 459                 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
 460
 461 /*
 462  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 463  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 464  * "...".
 465  */
 466 #define _FAIL(code) STMT_START {                                        \
 467     const char *ellipses = "";                                          \
 468     IV len = RExC_end - RExC_precomp;                                   \
 469                                                                         \
 470     if (!SIZE_ONLY)                                                     \
 471         SAVEFREESV(RExC_rx_sv);                                         \
 472     if (len > RegexLengthToShowInErrorMessages) {                       \
 473         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 474         len = RegexLengthToShowInErrorMessages - 10;                    \
 475         ellipses = "...";                                               \
 476     }                                                                   \
 477     code;                                                               \
 478 } STMT_END
 479
 480 #define FAIL(msg) _FAIL(                            \
 481     Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/",           \
 482             msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 483
 484 #define FAIL2(msg,arg) _FAIL(                       \
 485     Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/",         \
 486             arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 487
 488 /*
 489  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 490  */
 491 #define Simple_vFAIL(m) STMT_START {                                    \
 492     const IV offset = RExC_parse - RExC_precomp;                        \
 493     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 494             m, REPORT_LOCATION_ARGS(offset));   \
 495 } STMT_END
 496
 497 /*
 498  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 499  */
 500 #define vFAIL(m) STMT_START {                           \
 501     if (!SIZE_ONLY)                                     \
 502         SAVEFREESV(RExC_rx_sv);                         \
 503     Simple_vFAIL(m);                                    \
 504 } STMT_END
 505
 506 /*
 507  * Like Simple_vFAIL(), but accepts two arguments.
 508  */
 509 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 510     const IV offset = RExC_parse - RExC_precomp;                        \
 511     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,                      \
 512                       REPORT_LOCATION_ARGS(offset));    \
 513 } STMT_END
 514
 515 /*
 516  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 517  */
 518 #define vFAIL2(m,a1) STMT_START {                       \
 519     if (!SIZE_ONLY)                                     \
 520         SAVEFREESV(RExC_rx_sv);                         \
 521     Simple_vFAIL2(m, a1);                               \
 522 } STMT_END
 523
 524
 525 /*
 526  * Like Simple_vFAIL(), but accepts three arguments.
 527  */
 528 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 529     const IV offset = RExC_parse - RExC_precomp;                \
 530     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,          \
 531             REPORT_LOCATION_ARGS(offset));      \
 532 } STMT_END
 533
 534 /*
 535  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 536  */
 537 #define vFAIL3(m,a1,a2) STMT_START {                    \
 538     if (!SIZE_ONLY)                                     \
 539         SAVEFREESV(RExC_rx_sv);                         \
 540     Simple_vFAIL3(m, a1, a2);                           \
 541 } STMT_END
 542
 543 /*
 544  * Like Simple_vFAIL(), but accepts four arguments.
 545  */
 546 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 547     const IV offset = RExC_parse - RExC_precomp;                \
 548     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3,              \
 549             REPORT_LOCATION_ARGS(offset));      \
 550 } STMT_END
 551
 552 #define vFAIL4(m,a1,a2,a3) STMT_START {                 \
 553     if (!SIZE_ONLY)                                     \
 554         SAVEFREESV(RExC_rx_sv);                         \
 555     Simple_vFAIL4(m, a1, a2, a3);                       \
 556 } STMT_END
 557
 558 /* A specialized version of vFAIL2 that works with UTF8f */
 559 #define vFAIL2utf8f(m, a1) STMT_START { \
 560     const IV offset = RExC_parse - RExC_precomp;   \
 561     if (!SIZE_ONLY)                                \
 562         SAVEFREESV(RExC_rx_sv);                    \
 563     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
 564             REPORT_LOCATION_ARGS(offset));         \
 565 } STMT_END
 566
 567
 568 /* m is not necessarily a "literal string", in this macro */
 569 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
 570     const IV offset = loc - RExC_precomp;                               \
 571     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
 572             m, REPORT_LOCATION_ARGS(offset));       \
 573 } STMT_END
 574
 575 #define ckWARNreg(loc,m) STMT_START {                                   \
 576     const IV offset = loc - RExC_precomp;                               \
 577     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 578             REPORT_LOCATION_ARGS(offset));              \
 579 } STMT_END
 580
 581 #define vWARN_dep(loc, m) STMT_START {                                  \
 582     const IV offset = loc - RExC_precomp;                               \
 583     Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION,     \
 584             REPORT_LOCATION_ARGS(offset));              \
 585 } STMT_END
 586
 587 #define ckWARNdep(loc,m) STMT_START {                                   \
 588     const IV offset = loc - RExC_precomp;                               \
 589     Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),                   \
 590             m REPORT_LOCATION,                                          \
 591             REPORT_LOCATION_ARGS(offset));              \
 592 } STMT_END
 593
 594 #define ckWARNregdep(loc,m) STMT_START {                                \
 595     const IV offset = loc - RExC_precomp;                               \
 596     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 597             m REPORT_LOCATION,                                          \
 598             REPORT_LOCATION_ARGS(offset));              \
 599 } STMT_END
 600
 601 #define ckWARN2reg_d(loc,m, a1) STMT_START {                            \
 602     const IV offset = loc - RExC_precomp;                               \
 603     Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),                       \
 604             m REPORT_LOCATION,                                          \
 605             a1, REPORT_LOCATION_ARGS(offset));  \
 606 } STMT_END
 607
 608 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 609     const IV offset = loc - RExC_precomp;                               \
 610     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 611             a1, REPORT_LOCATION_ARGS(offset));  \
 612 } STMT_END
 613
 614 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 615     const IV offset = loc - RExC_precomp;                               \
 616     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 617             a1, a2, REPORT_LOCATION_ARGS(offset));      \
 618 } STMT_END
 619
 620 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 621     const IV offset = loc - RExC_precomp;                               \
 622     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 623             a1, a2, REPORT_LOCATION_ARGS(offset));      \
 624 } STMT_END
 625
 626 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 627     const IV offset = loc - RExC_precomp;                               \
 628     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 629             a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 630 } STMT_END
 631
 632 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 633     const IV offset = loc - RExC_precomp;                               \
 634     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 635             a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 636 } STMT_END
 637
 638 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 639     const IV offset = loc - RExC_precomp;                               \
 640     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 641             a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
 642 } STMT_END
 643
 644
 645 /* Allow for side effects in s */
 646 #define REGC(c,s) STMT_START {                  \
 647     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 648 } STMT_END
 649
 650 /* Macros for recording node offsets.   20001227 mjd@plover.com
 651  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 652  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 653  * Element 0 holds the number n.
 654  * Position is 1 indexed.
 655  */
 656 #ifndef RE_TRACK_PATTERN_OFFSETS
 657 #define Set_Node_Offset_To_R(node,byte)
 658 #define Set_Node_Offset(node,byte)
 659 #define Set_Cur_Node_Offset
 660 #define Set_Node_Length_To_R(node,len)
 661 #define Set_Node_Length(node,len)
 662 #define Set_Node_Cur_Length(node,start)
 663 #define Node_Offset(n)
 664 #define Node_Length(n)
 665 #define Set_Node_Offset_Length(node,offset,len)
 666 #define ProgLen(ri) ri->u.proglen
 667 #define SetProgLen(ri,x) ri->u.proglen = x
 668 #else
 669 #define ProgLen(ri) ri->u.offsets[0]
 670 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 671 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 672     if (! SIZE_ONLY) {                                                  \
 673         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 674                     __LINE__, (int)(node), (int)(byte)));               \
 675         if((node) < 0) {                                                \
 676             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 677         } else {                                                        \
 678             RExC_offsets[2*(node)-1] = (byte);                          \
 679         }                                                               \
 680     }                                                                   \
 681 } STMT_END
 682
 683 #define Set_Node_Offset(node,byte) \
 684     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 685 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 686
 687 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 688     if (! SIZE_ONLY) {                                                  \
 689         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 690                 __LINE__, (int)(node), (int)(len)));                    \
 691         if((node) < 0) {                                                \
 692             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 693         } else {                                                        \
 694             RExC_offsets[2*(node)] = (len);                             \
 695         }                                                               \
 696     }                                                                   \
 697 } STMT_END
 698
 699 #define Set_Node_Length(node,len) \
 700     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 701 #define Set_Node_Cur_Length(node, start)                \
 702     Set_Node_Length(node, RExC_parse - start)
 703
 704 /* Get offsets and lengths */
 705 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 706 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 707
 708 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 709     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 710     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 711 } STMT_END
 712 #endif
 713
 714 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 715 #define EXPERIMENTAL_INPLACESCAN
 716 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 717
 718 #define DEBUG_RExC_seen() \
 719         DEBUG_OPTIMISE_MORE_r({                                                     \
 720             PerlIO_printf(Perl_debug_log,"RExC_seen: ");                            \
 721                                                                                     \
 722             if (RExC_seen & REG_SEEN_ZERO_LEN)                                      \
 723                 PerlIO_printf(Perl_debug_log,"REG_SEEN_ZERO_LEN ");                 \
 724                                                                                     \
 725             if (RExC_seen & REG_SEEN_LOOKBEHIND)                                    \
 726                 PerlIO_printf(Perl_debug_log,"REG_SEEN_LOOKBEHIND ");               \
 727                                                                                     \
 728             if (RExC_seen & REG_SEEN_GPOS)                                          \
 729                 PerlIO_printf(Perl_debug_log,"REG_SEEN_GPOS ");                     \
 730                                                                                     \
 731             if (RExC_seen & REG_SEEN_CANY)                                            \
 732                 PerlIO_printf(Perl_debug_log,"REG_SEEN_CANY ");                     \
 733                                                                                     \
 734             if (RExC_seen & REG_SEEN_RECURSE)                                       \
 735                 PerlIO_printf(Perl_debug_log,"REG_SEEN_RECURSE ");                  \
 736                                                                                     \
 737             if (RExC_seen & REG_TOP_LEVEL_BRANCHES)                                 \
 738                 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES ");            \
 739                                                                                     \
 740             if (RExC_seen & REG_SEEN_VERBARG)                                       \
 741                 PerlIO_printf(Perl_debug_log,"REG_SEEN_VERBARG ");                  \
 742                                                                                     \
 743             if (RExC_seen & REG_SEEN_CUTGROUP)                                      \
 744                 PerlIO_printf(Perl_debug_log,"REG_SEEN_CUTGROUP ");                 \
 745                                                                                     \
 746             if (RExC_seen & REG_SEEN_RUN_ON_COMMENT)                                \
 747                 PerlIO_printf(Perl_debug_log,"REG_SEEN_RUN_ON_COMMENT ");           \
 748                                                                                     \
 749             if (RExC_seen & REG_SEEN_EXACTF_SHARP_S)                                \
 750                 PerlIO_printf(Perl_debug_log,"REG_SEEN_EXACTF_SHARP_S ");           \
 751                                                                                     \
 752             if (RExC_seen & REG_SEEN_GOSTART)                                       \
 753                 PerlIO_printf(Perl_debug_log,"REG_SEEN_GOSTART ");                  \
 754                                                                                     \
 755             PerlIO_printf(Perl_debug_log,"\n");                                     \
 756         });
 757
 758 #define DEBUG_STUDYDATA(str,data,depth)                              \
 759 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 760     PerlIO_printf(Perl_debug_log,                                    \
 761         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 762         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 763         (int)(depth)*2, "",                                          \
 764         (IV)((data)->pos_min),                                       \
 765         (IV)((data)->pos_delta),                                     \
 766         (UV)((data)->flags),                                         \
 767         (IV)((data)->whilem_c),                                      \
 768         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 769         is_inf ? "INF " : ""                                         \
 770     );                                                               \
 771     if ((data)->last_found)                                          \
 772         PerlIO_printf(Perl_debug_log,                                \
 773             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 774             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 775             SvPVX_const((data)->last_found),                         \
 776             (IV)((data)->last_end),                                  \
 777             (IV)((data)->last_start_min),                            \
 778             (IV)((data)->last_start_max),                            \
 779             ((data)->longest &&                                      \
 780              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 781             SvPVX_const((data)->longest_fixed),                      \
 782             (IV)((data)->offset_fixed),                              \
 783             ((data)->longest &&                                      \
 784              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 785             SvPVX_const((data)->longest_float),                      \
 786             (IV)((data)->offset_float_min),                          \
 787             (IV)((data)->offset_float_max)                           \
 788         );                                                           \
 789     PerlIO_printf(Perl_debug_log,"\n");                              \
 790 });
 791
 792 /* Mark that we cannot extend a found fixed substring at this point.
 793    Update the longest found anchored substring and the longest found
 794    floating substrings if needed. */
 795
 796 STATIC void
 797 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
 798                     SSize_t *minlenp, int is_inf)
 799 {
 800     const STRLEN l = CHR_SVLEN(data->last_found);
 801     const STRLEN old_l = CHR_SVLEN(*data->longest);
 802     GET_RE_DEBUG_FLAGS_DECL;
 803
 804     PERL_ARGS_ASSERT_SCAN_COMMIT;
 805
 806     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 807         SvSetMagicSV(*data->longest, data->last_found);
 808         if (*data->longest == data->longest_fixed) {
 809             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 810             if (data->flags & SF_BEFORE_EOL)
 811                 data->flags
 812                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 813             else
 814                 data->flags &= ~SF_FIX_BEFORE_EOL;
 815             data->minlen_fixed=minlenp;
 816             data->lookbehind_fixed=0;
 817         }
 818         else { /* *data->longest == data->longest_float */
 819             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 820             data->offset_float_max = (l
 821                                       ? data->last_start_max
 822                                       : (data->pos_delta == SSize_t_MAX
 823                                          ? SSize_t_MAX
 824                                          : data->pos_min + data->pos_delta));
 825             if (is_inf
 826                  || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
 827                 data->offset_float_max = SSize_t_MAX;
 828             if (data->flags & SF_BEFORE_EOL)
 829                 data->flags
 830                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 831             else
 832                 data->flags &= ~SF_FL_BEFORE_EOL;
 833             data->minlen_float=minlenp;
 834             data->lookbehind_float=0;
 835         }
 836     }
 837     SvCUR_set(data->last_found, 0);
 838     {
 839         SV * const sv = data->last_found;
 840         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 841             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 842             if (mg)
 843                 mg->mg_len = 0;
 844         }
 845     }
 846     data->last_end = -1;
 847     data->flags &= ~SF_BEFORE_EOL;
 848     DEBUG_STUDYDATA("commit: ",data,0);
 849 }
 850
 851 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
 852  * list that describes which code points it matches */
 853
 854 STATIC void
 855 S_ssc_anything(pTHX_ regnode_ssc *ssc)
 856 {
 857     /* Set the SSC 'ssc' to match an empty string or any code point */
 858
 859     PERL_ARGS_ASSERT_SSC_ANYTHING;
 860
 861     assert(OP(ssc) == ANYOF_SYNTHETIC);
 862
 863     ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
 864     _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
 865     ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING;    /* Plus match empty string */
 866 }
 867
 868 STATIC int
 869 S_ssc_is_anything(pTHX_ const regnode_ssc *ssc)
 870 {
 871     /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
 872      * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
 873      * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
 874      * in any way, so there's no point in using it */
 875
 876     UV start, end;
 877     bool ret;
 878
 879     PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
 880
 881     assert(OP(ssc) == ANYOF_SYNTHETIC);
 882
 883     if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) {
 884         return FALSE;
 885     }
 886
 887     /* See if the list consists solely of the range 0 - Infinity */
 888     invlist_iterinit(ssc->invlist);
 889     ret = invlist_iternext(ssc->invlist, &start, &end)
 890           && start == 0
 891           && end == UV_MAX;
 892
 893     invlist_iterfinish(ssc->invlist);
 894
 895     if (ret) {
 896         return TRUE;
 897     }
 898
 899     /* If e.g., both \w and \W are set, matches everything */
 900     if (ANYOF_FLAGS(ssc) & ANYOF_POSIXL) {
 901         int i;
 902         for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
 903             if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
 904                 return TRUE;
 905             }
 906         }
 907     }
 908
 909     return FALSE;
 910 }
 911
 912 STATIC void
 913 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
 914 {
 915     /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
 916      * string, any code point, or any posix class under locale */
 917
 918     PERL_ARGS_ASSERT_SSC_INIT;
 919
 920     Zero(ssc, 1, regnode_ssc);
 921     OP(ssc) = ANYOF_SYNTHETIC;
 922     ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY);
 923     ssc_anything(ssc);
 924
 925     /* If any portion of the regex is to operate under locale rules,
 926      * initialization includes it.  The reason this isn't done for all regexes
 927      * is that the optimizer was written under the assumption that locale was
 928      * all-or-nothing.  Given the complexity and lack of documentation in the
 929      * optimizer, and that there are inadequate test cases for locale, many
 930      * parts of it may not work properly, it is safest to avoid locale unless
 931      * necessary. */
 932     if (RExC_contains_locale) {
 933         ANYOF_POSIXL_SETALL(ssc);
 934         ANYOF_FLAGS(ssc) |= ANYOF_LOCALE|ANYOF_POSIXL;
 935         if (RExC_contains_i) {
 936             ANYOF_FLAGS(ssc) |= ANYOF_LOC_FOLD;
 937         }
 938     }
 939     else {
 940         ANYOF_POSIXL_ZERO(ssc);
 941     }
 942 }
 943
 944 STATIC int
 945 S_ssc_is_cp_posixl_init(pTHX_ const RExC_state_t *pRExC_state,
 946                               const regnode_ssc *ssc)
 947 {
 948     /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
 949      * to the list of code points matched, and locale posix classes; hence does
 950      * not check its flags) */
 951
 952     UV start, end;
 953     bool ret;
 954
 955     PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
 956
 957     assert(OP(ssc) == ANYOF_SYNTHETIC);
 958
 959     invlist_iterinit(ssc->invlist);
 960     ret = invlist_iternext(ssc->invlist, &start, &end)
 961           && start == 0
 962           && end == UV_MAX;
 963
 964     invlist_iterfinish(ssc->invlist);
 965
 966     if (! ret) {
 967         return FALSE;
 968     }
 969
 970     if (RExC_contains_locale) {
 971         if (! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE)
 972             || ! (ANYOF_FLAGS(ssc) & ANYOF_POSIXL)
 973             || ! ANYOF_POSIXL_TEST_ALL_SET(ssc))
 974         {
 975             return FALSE;
 976         }
 977         if (RExC_contains_i && ! (ANYOF_FLAGS(ssc) & ANYOF_LOC_FOLD)) {
 978             return FALSE;
 979         }
 980     }
 981
 982     return TRUE;
 983 }
 984
 985 STATIC SV*
 986 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
 987                                   const regnode_charclass_posixl* const node)
 988 {
 989     /* Returns a mortal inversion list defining which code points are matched
 990      * by 'node', which is of type ANYOF.  Handles complementing the result if
 991      * appropriate.  If some code points aren't knowable at this time, the
 992      * returned list must, and will, contain every possible code point. */
 993
 994     SV* invlist = sv_2mortal(_new_invlist(0));
 995     unsigned int i;
 996     const U32 n = ARG(node);
 997
 998     PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
 999
1000     /* Look at the data structure created by S_set_ANYOF_arg() */
1001     if (n != ANYOF_NONBITMAP_EMPTY) {
1002         SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1003         AV * const av = MUTABLE_AV(SvRV(rv));
1004         SV **const ary = AvARRAY(av);
1005         assert(RExC_rxi->data->what[n] == 's');
1006
1007         if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1008             invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1009         }
1010         else if (ary[0] && ary[0] != &PL_sv_undef) {
1011
1012             /* Here, no compile-time swash, and there are things that won't be
1013              * known until runtime -- we have to assume it could be anything */
1014             return _add_range_to_invlist(invlist, 0, UV_MAX);
1015         }
1016         else {
1017
1018             /* Here no compile-time swash, and no run-time only data.  Use the
1019              * node's inversion list */
1020             invlist = sv_2mortal(invlist_clone(ary[2]));
1021         }
1022     }
1023
1024     /* An ANYOF node contains a bitmap for the first 256 code points, and an
1025      * inversion list for the others, but if there are code points that should
1026      * match only conditionally on the target string being UTF-8, those are
1027      * placed in the inversion list, and not the bitmap.  Since there are
1028      * circumstances under which they could match, they are included in the
1029      * SSC.  But if the ANYOF node is to be inverted, we have to exclude them
1030      * here, so that when we invert below, the end result actually does include
1031      * them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We have to do this here
1032      * before we add the unconditionally matched code points */
1033     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1034         _invlist_intersection_complement_2nd(invlist,
1035                                              PL_UpperLatin1,
1036                                              &invlist);
1037     }
1038
1039     /* Add in the points from the bit map */
1040     for (i = 0; i < 256; i++) {
1041         if (ANYOF_BITMAP_TEST(node, i)) {
1042             invlist = add_cp_to_invlist(invlist, i);
1043         }
1044     }
1045
1046     /* If this can match all upper Latin1 code points, have to add them
1047      * as well */
1048     if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_LATIN1_ALL) {
1049         _invlist_union(invlist, PL_UpperLatin1, &invlist);
1050     }
1051
1052     /* Similarly for these */
1053     if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
1054         invlist = _add_range_to_invlist(invlist, 256, UV_MAX);
1055     }
1056
1057     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1058         _invlist_invert(invlist);
1059     }
1060
1061     return invlist;
1062 }
1063
1064 /* These two functions currently do the exact same thing */
1065 #define ssc_init_zero           ssc_init
1066
1067 #define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
1068 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1069
1070 STATIC void
1071 S_ssc_flags_and(regnode_ssc *ssc, const U8 and_with)
1072 {
1073     /* Take the flags 'and_with' and accumulate them anded into the flags for
1074      * the SSC 'ssc'.  The non-SSC related flags in 'and_with' are ignored.
1075      * The flags 'and_with' should not come from another SSC (otherwise the
1076      * EMPTY_STRING flag won't work) */
1077
1078     const U8 ssc_only_flags = ANYOF_FLAGS(ssc) & ~ANYOF_LOCALE_FLAGS;
1079
1080     PERL_ARGS_ASSERT_SSC_FLAGS_AND;
1081
1082     /* Use just the SSC-related flags from 'and_with' */
1083     ANYOF_FLAGS(ssc) &= (and_with & ANYOF_LOCALE_FLAGS);
1084     ANYOF_FLAGS(ssc) |= ssc_only_flags;
1085 }
1086
1087 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
1088  * should not be inverted.  'and_with->flags & ANYOF_POSIXL' should be 0 if
1089  * 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1090
1091 STATIC void
1092 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1093                 const regnode_ssc *and_with)
1094 {
1095     /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1096      * another SSC or a regular ANYOF class.  Can create false positives. */
1097
1098     SV* anded_cp_list;
1099     U8  anded_flags;
1100
1101     PERL_ARGS_ASSERT_SSC_AND;
1102
1103     assert(OP(ssc) == ANYOF_SYNTHETIC);
1104
1105     /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1106      * the code point inversion list and just the relevant flags */
1107     if (OP(and_with) == ANYOF_SYNTHETIC) {
1108         anded_cp_list = and_with->invlist;
1109         anded_flags = ANYOF_FLAGS(and_with);
1110     }
1111     else {
1112         anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state,
1113                                         (regnode_charclass_posixl*) and_with);
1114         anded_flags = ANYOF_FLAGS(and_with) & ANYOF_LOCALE_FLAGS;
1115     }
1116
1117     ANYOF_FLAGS(ssc) &= anded_flags;
1118
1119     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1120      * C2 is the list of code points in 'and-with'; P2, its posix classes.
1121      * 'and_with' may be inverted.  When not inverted, we have the situation of
1122      * computing:
1123      *  (C1 | P1) & (C2 | P2)
1124      *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1125      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1126      *                    <=  ((C1 & C2) |       P2)) | ( P1       | (P1 & P2))
1127      *                    <=  ((C1 & C2) | P1 | P2)
1128      * Alternatively, the last few steps could be:
1129      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1130      *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
1131      *                    <=  (C1 | C2 | (P1 & P2))
1132      * We favor the second approach if either P1 or P2 is non-empty.  This is
1133      * because these components are a barrier to doing optimizations, as what
1134      * they match cannot be known until the moment of matching as they are
1135      * dependent on the current locale, 'AND"ing them likely will reduce or
1136      * eliminate them.
1137      * But we can do better if we know that C1,P1 are in their initial state (a
1138      * frequent occurrence), each matching everything:
1139      *  (<everything>) & (C2 | P2) =  C2 | P2
1140      * Similarly, if C2,P2 are in their initial state (again a frequent
1141      * occurrence), the result is a no-op
1142      *  (C1 | P1) & (<everything>) =  C1 | P1
1143      *
1144      * Inverted, we have
1145      *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
1146      *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1147      *                         <=  (C1 & ~C2) | (P1 & ~P2)
1148      * */
1149
1150     if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1151         && OP(and_with) != ANYOF_SYNTHETIC)
1152     {
1153         unsigned int i;
1154
1155         ssc_intersection(ssc,
1156                          anded_cp_list,
1157                          FALSE /* Has already been inverted */
1158                          );
1159
1160         /* If either P1 or P2 is empty, the intersection will be also; can skip
1161          * the loop */
1162         if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) {
1163             ANYOF_POSIXL_ZERO(ssc);
1164         }
1165         else if (ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
1166
1167             /* Note that the Posix class component P from 'and_with' actually
1168              * looks like:
1169              *      P = Pa | Pb | ... | Pn
1170              * where each component is one posix class, such as in [\w\s].
1171              * Thus
1172              *      ~P = ~(Pa | Pb | ... | Pn)
1173              *         = ~Pa & ~Pb & ... & ~Pn
1174              *        <= ~Pa | ~Pb | ... | ~Pn
1175              * The last is something we can easily calculate, but unfortunately
1176              * is likely to have many false positives.  We could do better
1177              * in some (but certainly not all) instances if two classes in
1178              * P have known relationships.  For example
1179              *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1180              * So
1181              *      :lower: & :print: = :lower:
1182              * And similarly for classes that must be disjoint.  For example,
1183              * since \s and \w can have no elements in common based on rules in
1184              * the POSIX standard,
1185              *      \w & ^\S = nothing
1186              * Unfortunately, some vendor locales do not meet the Posix
1187              * standard, in particular almost everything by Microsoft.
1188              * The loop below just changes e.g., \w into \W and vice versa */
1189
1190             regnode_charclass_posixl temp;
1191             int add = 1;    /* To calculate the index of the complement */
1192
1193             ANYOF_POSIXL_ZERO(&temp);
1194             for (i = 0; i < ANYOF_MAX; i++) {
1195                 assert(i % 2 != 0
1196                        || ! ANYOF_POSIXL_TEST(and_with, i)
1197                        || ! ANYOF_POSIXL_TEST(and_with, i + 1));
1198
1199                 if (ANYOF_POSIXL_TEST(and_with, i)) {
1200                     ANYOF_POSIXL_SET(&temp, i + add);
1201                 }
1202                 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1203             }
1204             ANYOF_POSIXL_AND(&temp, ssc);
1205
1206         } /* else ssc already has no posixes */
1207     } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
1208          in its initial state */
1209     else if (OP(and_with) != ANYOF_SYNTHETIC
1210              || ! ssc_is_cp_posixl_init(pRExC_state, and_with))
1211     {
1212         /* But if 'ssc' is in its initial state, the result is just 'and_with';
1213          * copy it over 'ssc' */
1214         if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1215             if (OP(and_with) == ANYOF_SYNTHETIC) {
1216                 StructCopy(and_with, ssc, regnode_ssc);
1217             }
1218             else {
1219                 ssc->invlist = anded_cp_list;
1220                 ANYOF_POSIXL_ZERO(ssc);
1221                 if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1222                     ANYOF_POSIXL_OR(and_with, ssc);
1223                 }
1224             }
1225         }
1226         else if ((ANYOF_FLAGS(ssc) & ANYOF_POSIXL)
1227                     || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL))
1228         {
1229             /* One or the other of P1, P2 is non-empty. */
1230             ANYOF_POSIXL_AND(and_with, ssc);
1231             ssc_union(ssc, anded_cp_list, FALSE);
1232         }
1233         else { /* P1 = P2 = empty */
1234             ssc_intersection(ssc, anded_cp_list, FALSE);
1235         }
1236     }
1237 }
1238
1239 STATIC void
1240 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1241                const regnode_ssc *or_with)
1242 {
1243     /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1244      * another SSC or a regular ANYOF class.  Can create false positives if
1245      * 'or_with' is to be inverted. */
1246
1247     SV* ored_cp_list;
1248     U8 ored_flags;
1249
1250     PERL_ARGS_ASSERT_SSC_OR;
1251
1252     assert(OP(ssc) == ANYOF_SYNTHETIC);
1253
1254     /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1255      * the code point inversion list and just the relevant flags */
1256     if (OP(or_with) == ANYOF_SYNTHETIC) {
1257         ored_cp_list = or_with->invlist;
1258         ored_flags = ANYOF_FLAGS(or_with);
1259     }
1260     else {
1261         ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state,
1262                                         (regnode_charclass_posixl*) or_with);
1263         ored_flags = ANYOF_FLAGS(or_with) & ANYOF_LOCALE_FLAGS;
1264     }
1265
1266     ANYOF_FLAGS(ssc) |= ored_flags;
1267
1268     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1269      * C2 is the list of code points in 'or-with'; P2, its posix classes.
1270      * 'or_with' may be inverted.  When not inverted, we have the simple
1271      * situation of computing:
1272      *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
1273      * If P1|P2 yields a situation with both a class and its complement are
1274      * set, like having both \w and \W, this matches all code points, and we
1275      * can delete these from the P component of the ssc going forward.  XXX We
1276      * might be able to delete all the P components, but I (khw) am not certain
1277      * about this, and it is better to be safe.
1278      *
1279      * Inverted, we have
1280      *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
1281      *                         <=  (C1 | P1) | ~C2
1282      *                         <=  (C1 | ~C2) | P1
1283      * (which results in actually simpler code than the non-inverted case)
1284      * */
1285
1286     if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1287         && OP(or_with) != ANYOF_SYNTHETIC)
1288     {
1289         /* We ignore P2, leaving P1 going forward */
1290     }
1291     else {  /* Not inverted */
1292         ANYOF_POSIXL_OR(or_with, ssc);
1293         if (ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
1294             unsigned int i;
1295             for (i = 0; i < ANYOF_MAX; i += 2) {
1296                 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1297                 {
1298                     ssc_match_all_cp(ssc);
1299                     ANYOF_POSIXL_CLEAR(ssc, i);
1300                     ANYOF_POSIXL_CLEAR(ssc, i+1);
1301                     if (! ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
1302                         ANYOF_FLAGS(ssc) &= ~ANYOF_POSIXL;
1303                     }
1304                 }
1305             }
1306         }
1307     }
1308
1309     ssc_union(ssc,
1310               ored_cp_list,
1311               FALSE /* Already has been inverted */
1312               );
1313 }
1314
1315 PERL_STATIC_INLINE void
1316 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1317 {
1318     PERL_ARGS_ASSERT_SSC_UNION;
1319
1320     assert(OP(ssc) == ANYOF_SYNTHETIC);
1321
1322     _invlist_union_maybe_complement_2nd(ssc->invlist,
1323                                         invlist,
1324                                         invert2nd,
1325                                         &ssc->invlist);
1326 }
1327
1328 PERL_STATIC_INLINE void
1329 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1330                          SV* const invlist,
1331                          const bool invert2nd)
1332 {
1333     PERL_ARGS_ASSERT_SSC_INTERSECTION;
1334
1335     assert(OP(ssc) == ANYOF_SYNTHETIC);
1336
1337     _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1338                                                invlist,
1339                                                invert2nd,
1340                                                &ssc->invlist);
1341 }
1342
1343 PERL_STATIC_INLINE void
1344 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1345 {
1346     PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1347
1348     assert(OP(ssc) == ANYOF_SYNTHETIC);
1349
1350     ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1351 }
1352
1353 PERL_STATIC_INLINE void
1354 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1355 {
1356     /* AND just the single code point 'cp' into the SSC 'ssc' */
1357
1358     SV* cp_list = _new_invlist(2);
1359
1360     PERL_ARGS_ASSERT_SSC_CP_AND;
1361
1362     assert(OP(ssc) == ANYOF_SYNTHETIC);
1363
1364     cp_list = add_cp_to_invlist(cp_list, cp);
1365     ssc_intersection(ssc, cp_list,
1366                      FALSE /* Not inverted */
1367                      );
1368     SvREFCNT_dec_NN(cp_list);
1369 }
1370
1371 PERL_STATIC_INLINE void
1372 S_ssc_clear_locale(pTHX_ regnode_ssc *ssc)
1373 {
1374     /* Set the SSC 'ssc' to not match any locale things */
1375
1376     PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1377
1378     assert(OP(ssc) == ANYOF_SYNTHETIC);
1379
1380     ANYOF_POSIXL_ZERO(ssc);
1381     ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1382 }
1383
1384 STATIC void
1385 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1386 {
1387     /* The inversion list in the SSC is marked mortal; now we need a more
1388      * permanent copy, which is stored the same way that is done in a regular
1389      * ANYOF node, with the first 256 code points in a bit map */
1390
1391     SV* invlist = invlist_clone(ssc->invlist);
1392
1393     PERL_ARGS_ASSERT_SSC_FINALIZE;
1394
1395     assert(OP(ssc) == ANYOF_SYNTHETIC);
1396
1397     /* The code in this file assumes that all but these flags aren't relevant
1398      * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the
1399      * time we reach here */
1400     assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_LOCALE_FLAGS));
1401
1402     populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1403
1404     set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL, FALSE);
1405
1406     assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE) || RExC_contains_locale);
1407 }
1408
1409 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1410 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1411 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1412 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1413
1414
1415 #ifdef DEBUGGING
1416 /*
1417    dump_trie(trie,widecharmap,revcharmap)
1418    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1419    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1420
1421    These routines dump out a trie in a somewhat readable format.
1422    The _interim_ variants are used for debugging the interim
1423    tables that are used to generate the final compressed
1424    representation which is what dump_trie expects.
1425
1426    Part of the reason for their existence is to provide a form
1427    of documentation as to how the different representations function.
1428
1429 */
1430
1431 /*
1432   Dumps the final compressed table form of the trie to Perl_debug_log.
1433   Used for debugging make_trie().
1434 */
1435
1436 STATIC void
1437 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1438             AV *revcharmap, U32 depth)
1439 {
1440     U32 state;
1441     SV *sv=sv_newmortal();
1442     int colwidth= widecharmap ? 6 : 4;
1443     U16 word;
1444     GET_RE_DEBUG_FLAGS_DECL;
1445
1446     PERL_ARGS_ASSERT_DUMP_TRIE;
1447
1448     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1449         (int)depth * 2 + 2,"",
1450         "Match","Base","Ofs" );
1451
1452     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1453         SV ** const tmp = av_fetch( revcharmap, state, 0);
1454         if ( tmp ) {
1455             PerlIO_printf( Perl_debug_log, "%*s",
1456                 colwidth,
1457                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1458                             PL_colors[0], PL_colors[1],
1459                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1460                             PERL_PV_ESCAPE_FIRSTCHAR
1461                 )
1462             );
1463         }
1464     }
1465     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1466         (int)depth * 2 + 2,"");
1467
1468     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1469         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1470     PerlIO_printf( Perl_debug_log, "\n");
1471
1472     for( state = 1 ; state < trie->statecount ; state++ ) {
1473         const U32 base = trie->states[ state ].trans.base;
1474
1475         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1476
1477         if ( trie->states[ state ].wordnum ) {
1478             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1479         } else {
1480             PerlIO_printf( Perl_debug_log, "%6s", "" );
1481         }
1482
1483         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1484
1485         if ( base ) {
1486             U32 ofs = 0;
1487
1488             while( ( base + ofs  < trie->uniquecharcount ) ||
1489                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1490                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1491                     ofs++;
1492
1493             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1494
1495             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1496                 if ( ( base + ofs >= trie->uniquecharcount ) &&
1497                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1498                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1499                 {
1500                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1501                     colwidth,
1502                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1503                 } else {
1504                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1505                 }
1506             }
1507
1508             PerlIO_printf( Perl_debug_log, "]");
1509
1510         }
1511         PerlIO_printf( Perl_debug_log, "\n" );
1512     }
1513     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1514     for (word=1; word <= trie->wordcount; word++) {
1515         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1516             (int)word, (int)(trie->wordinfo[word].prev),
1517             (int)(trie->wordinfo[word].len));
1518     }
1519     PerlIO_printf(Perl_debug_log, "\n" );
1520 }
1521 /*
1522   Dumps a fully constructed but uncompressed trie in list form.
1523   List tries normally only are used for construction when the number of
1524   possible chars (trie->uniquecharcount) is very high.
1525   Used for debugging make_trie().
1526 */
1527 STATIC void
1528 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1529                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1530                          U32 depth)
1531 {
1532     U32 state;
1533     SV *sv=sv_newmortal();
1534     int colwidth= widecharmap ? 6 : 4;
1535     GET_RE_DEBUG_FLAGS_DECL;
1536
1537     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1538
1539     /* print out the table precompression.  */
1540     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1541         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1542         "------:-----+-----------------\n" );
1543
1544     for( state=1 ; state < next_alloc ; state ++ ) {
1545         U16 charid;
1546
1547         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1548             (int)depth * 2 + 2,"", (UV)state  );
1549         if ( ! trie->states[ state ].wordnum ) {
1550             PerlIO_printf( Perl_debug_log, "%5s| ","");
1551         } else {
1552             PerlIO_printf( Perl_debug_log, "W%4x| ",
1553                 trie->states[ state ].wordnum
1554             );
1555         }
1556         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1557             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1558             if ( tmp ) {
1559                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1560                     colwidth,
1561                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1562                             PL_colors[0], PL_colors[1],
1563                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1564                             PERL_PV_ESCAPE_FIRSTCHAR
1565                     ) ,
1566                     TRIE_LIST_ITEM(state,charid).forid,
1567                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1568                 );
1569                 if (!(charid % 10))
1570                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1571                         (int)((depth * 2) + 14), "");
1572             }
1573         }
1574         PerlIO_printf( Perl_debug_log, "\n");
1575     }
1576 }
1577
1578 /*
1579   Dumps a fully constructed but uncompressed trie in table form.
1580   This is the normal DFA style state transition table, with a few
1581   twists to facilitate compression later.
1582   Used for debugging make_trie().
1583 */
1584 STATIC void
1585 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1586                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1587                           U32 depth)
1588 {
1589     U32 state;
1590     U16 charid;
1591     SV *sv=sv_newmortal();
1592     int colwidth= widecharmap ? 6 : 4;
1593     GET_RE_DEBUG_FLAGS_DECL;
1594
1595     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1596
1597     /*
1598        print out the table precompression so that we can do a visual check
1599        that they are identical.
1600      */
1601
1602     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1603
1604     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1605         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1606         if ( tmp ) {
1607             PerlIO_printf( Perl_debug_log, "%*s",
1608                 colwidth,
1609                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1610                             PL_colors[0], PL_colors[1],
1611                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1612                             PERL_PV_ESCAPE_FIRSTCHAR
1613                 )
1614             );
1615         }
1616     }
1617
1618     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1619
1620     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1621         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1622     }
1623
1624     PerlIO_printf( Perl_debug_log, "\n" );
1625
1626     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1627
1628         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1629             (int)depth * 2 + 2,"",
1630             (UV)TRIE_NODENUM( state ) );
1631
1632         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1633             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1634             if (v)
1635                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1636             else
1637                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1638         }
1639         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1640             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1641         } else {
1642             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1643             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1644         }
1645     }
1646 }
1647
1648 #endif
1649
1650
1651 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1652   startbranch: the first branch in the whole branch sequence
1653   first      : start branch of sequence of branch-exact nodes.
1654                May be the same as startbranch
1655   last       : Thing following the last branch.
1656                May be the same as tail.
1657   tail       : item following the branch sequence
1658   count      : words in the sequence
1659   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1660   depth      : indent depth
1661
1662 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1663
1664 A trie is an N'ary tree where the branches are determined by digital
1665 decomposition of the key. IE, at the root node you look up the 1st character and
1666 follow that branch repeat until you find the end of the branches. Nodes can be
1667 marked as "accepting" meaning they represent a complete word. Eg:
1668
1669   /he|she|his|hers/
1670
1671 would convert into the following structure. Numbers represent states, letters
1672 following numbers represent valid transitions on the letter from that state, if
1673 the number is in square brackets it represents an accepting state, otherwise it
1674 will be in parenthesis.
1675
1676       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1677       |    |
1678       |   (2)
1679       |    |
1680      (1)   +-i->(6)-+-s->[7]
1681       |
1682       +-s->(3)-+-h->(4)-+-e->[5]
1683
1684       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1685
1686 This shows that when matching against the string 'hers' we will begin at state 1
1687 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1688 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1689 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1690 single traverse. We store a mapping from accepting to state to which word was
1691 matched, and then when we have multiple possibilities we try to complete the
1692 rest of the regex in the order in which they occured in the alternation.
1693
1694 The only prior NFA like behaviour that would be changed by the TRIE support is
1695 the silent ignoring of duplicate alternations which are of the form:
1696
1697  / (DUPE|DUPE) X? (?{ ... }) Y /x
1698
1699 Thus EVAL blocks following a trie may be called a different number of times with
1700 and without the optimisation. With the optimisations dupes will be silently
1701 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1702 the following demonstrates:
1703
1704  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1705
1706 which prints out 'word' three times, but
1707
1708  'words'=~/(word|word|word)(?{ print $1 })S/
1709
1710 which doesnt print it out at all. This is due to other optimisations kicking in.
1711
1712 Example of what happens on a structural level:
1713
1714 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1715
1716    1: CURLYM[1] {1,32767}(18)
1717    5:   BRANCH(8)
1718    6:     EXACT <ac>(16)
1719    8:   BRANCH(11)
1720    9:     EXACT <ad>(16)
1721   11:   BRANCH(14)
1722   12:     EXACT <ab>(16)
1723   16:   SUCCEED(0)
1724   17:   NOTHING(18)
1725   18: END(0)
1726
1727 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1728 and should turn into:
1729
1730    1: CURLYM[1] {1,32767}(18)
1731    5:   TRIE(16)
1732         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1733           <ac>
1734           <ad>
1735           <ab>
1736   16:   SUCCEED(0)
1737   17:   NOTHING(18)
1738   18: END(0)
1739
1740 Cases where tail != last would be like /(?foo|bar)baz/:
1741
1742    1: BRANCH(4)
1743    2:   EXACT <foo>(8)
1744    4: BRANCH(7)
1745    5:   EXACT <bar>(8)
1746    7: TAIL(8)
1747    8: EXACT <baz>(10)
1748   10: END(0)
1749
1750 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1751 and would end up looking like:
1752
1753     1: TRIE(8)
1754       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1755         <foo>
1756         <bar>
1757    7: TAIL(8)
1758    8: EXACT <baz>(10)
1759   10: END(0)
1760
1761     d = uvchr_to_utf8_flags(d, uv, 0);
1762
1763 is the recommended Unicode-aware way of saying
1764
1765     *(d++) = uv;
1766 */
1767
1768 #define TRIE_STORE_REVCHAR(val)                                            \
1769     STMT_START {                                                           \
1770         if (UTF) {                                                         \
1771             SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1772             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1773             unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1774             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1775             SvPOK_on(zlopp);                                               \
1776             SvUTF8_on(zlopp);                                              \
1777             av_push(revcharmap, zlopp);                                    \
1778         } else {                                                           \
1779             char ooooff = (char)val;                                           \
1780             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1781         }                                                                  \
1782         } STMT_END
1783
1784 /* This gets the next character from the input, folding it if not already
1785  * folded. */
1786 #define TRIE_READ_CHAR STMT_START {                                           \
1787     wordlen++;                                                                \
1788     if ( UTF ) {                                                              \
1789         /* if it is UTF then it is either already folded, or does not need    \
1790          * folding */                                                         \
1791         uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
1792     }                                                                         \
1793     else if (folder == PL_fold_latin1) {                                      \
1794         /* This folder implies Unicode rules, which in the range expressible  \
1795          *  by not UTF is the lower case, with the two exceptions, one of     \
1796          *  which should have been taken care of before calling this */       \
1797         assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
1798         uvc = toLOWER_L1(*uc);                                                \
1799         if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
1800         len = 1;                                                              \
1801     } else {                                                                  \
1802         /* raw data, will be folded later if needed */                        \
1803         uvc = (U32)*uc;                                                       \
1804         len = 1;                                                              \
1805     }                                                                         \
1806 } STMT_END
1807
1808
1809
1810 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1811     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1812         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1813         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1814     }                                                           \
1815     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1816     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1817     TRIE_LIST_CUR( state )++;                                   \
1818 } STMT_END
1819
1820 #define TRIE_LIST_NEW(state) STMT_START {                       \
1821     Newxz( trie->states[ state ].trans.list,               \
1822         4, reg_trie_trans_le );                                 \
1823      TRIE_LIST_CUR( state ) = 1;                                \
1824      TRIE_LIST_LEN( state ) = 4;                                \
1825 } STMT_END
1826
1827 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1828     U16 dupe= trie->states[ state ].wordnum;                    \
1829     regnode * const noper_next = regnext( noper );              \
1830                                                                 \
1831     DEBUG_r({                                                   \
1832         /* store the word for dumping */                        \
1833         SV* tmp;                                                \
1834         if (OP(noper) != NOTHING)                               \
1835             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1836         else                                                    \
1837             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1838         av_push( trie_words, tmp );                             \
1839     });                                                         \
1840                                                                 \
1841     curword++;                                                  \
1842     trie->wordinfo[curword].prev   = 0;                         \
1843     trie->wordinfo[curword].len    = wordlen;                   \
1844     trie->wordinfo[curword].accept = state;                     \
1845                                                                 \
1846     if ( noper_next < tail ) {                                  \
1847         if (!trie->jump)                                        \
1848             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1849         trie->jump[curword] = (U16)(noper_next - convert);      \
1850         if (!jumper)                                            \
1851             jumper = noper_next;                                \
1852         if (!nextbranch)                                        \
1853             nextbranch= regnext(cur);                           \
1854     }                                                           \
1855                                                                 \
1856     if ( dupe ) {                                               \
1857         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1858         /* chain, so that when the bits of chain are later    */\
1859         /* linked together, the dups appear in the chain      */\
1860         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1861         trie->wordinfo[dupe].prev = curword;                    \
1862     } else {                                                    \
1863         /* we haven't inserted this word yet.                */ \
1864         trie->states[ state ].wordnum = curword;                \
1865     }                                                           \
1866 } STMT_END
1867
1868
1869 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1870      ( ( base + charid >=  ucharcount                                   \
1871          && base + charid < ubound                                      \
1872          && state == trie->trans[ base - ucharcount + charid ].check    \
1873          && trie->trans[ base - ucharcount + charid ].next )            \
1874            ? trie->trans[ base - ucharcount + charid ].next             \
1875            : ( state==1 ? special : 0 )                                 \
1876       )
1877
1878 #define MADE_TRIE       1
1879 #define MADE_JUMP_TRIE  2
1880 #define MADE_EXACT_TRIE 4
1881
1882 STATIC I32
1883 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1884 {
1885     dVAR;
1886     /* first pass, loop through and scan words */
1887     reg_trie_data *trie;
1888     HV *widecharmap = NULL;
1889     AV *revcharmap = newAV();
1890     regnode *cur;
1891     STRLEN len = 0;
1892     UV uvc = 0;
1893     U16 curword = 0;
1894     U32 next_alloc = 0;
1895     regnode *jumper = NULL;
1896     regnode *nextbranch = NULL;
1897     regnode *convert = NULL;
1898     U32 *prev_states; /* temp array mapping each state to previous one */
1899     /* we just use folder as a flag in utf8 */
1900     const U8 * folder = NULL;
1901
1902 #ifdef DEBUGGING
1903     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
1904     AV *trie_words = NULL;
1905     /* along with revcharmap, this only used during construction but both are
1906      * useful during debugging so we store them in the struct when debugging.
1907      */
1908 #else
1909     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
1910     STRLEN trie_charcount=0;
1911 #endif
1912     SV *re_trie_maxbuff;
1913     GET_RE_DEBUG_FLAGS_DECL;
1914
1915     PERL_ARGS_ASSERT_MAKE_TRIE;
1916 #ifndef DEBUGGING
1917     PERL_UNUSED_ARG(depth);
1918 #endif
1919
1920     switch (flags) {
1921         case EXACT: break;
1922         case EXACTFA:
1923         case EXACTFU_SS:
1924         case EXACTFU: folder = PL_fold_latin1; break;
1925         case EXACTF:  folder = PL_fold; break;
1926         case EXACTFL: folder = PL_fold_locale; break;
1927         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1928     }
1929
1930     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1931     trie->refcount = 1;
1932     trie->startstate = 1;
1933     trie->wordcount = word_count;
1934     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1935     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1936     if (flags == EXACT)
1937         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1938     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1939                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1940
1941     DEBUG_r({
1942         trie_words = newAV();
1943     });
1944
1945     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1946     if (!SvIOK(re_trie_maxbuff)) {
1947         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1948     }
1949     DEBUG_TRIE_COMPILE_r({
1950                 PerlIO_printf( Perl_debug_log,
1951                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1952                   (int)depth * 2 + 2, "",
1953                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1954                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1955                   (int)depth);
1956     });
1957
1958    /* Find the node we are going to overwrite */
1959     if ( first == startbranch && OP( last ) != BRANCH ) {
1960         /* whole branch chain */
1961         convert = first;
1962     } else {
1963         /* branch sub-chain */
1964         convert = NEXTOPER( first );
1965     }
1966
1967     /*  -- First loop and Setup --
1968
1969        We first traverse the branches and scan each word to determine if it
1970        contains widechars, and how many unique chars there are, this is
1971        important as we have to build a table with at least as many columns as we
1972        have unique chars.
1973
1974        We use an array of integers to represent the character codes 0..255
1975        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1976        native representation of the character value as the key and IV's for the
1977        coded index.
1978
1979        *TODO* If we keep track of how many times each character is used we can
1980        remap the columns so that the table compression later on is more
1981        efficient in terms of memory by ensuring the most common value is in the
1982        middle and the least common are on the outside.  IMO this would be better
1983        than a most to least common mapping as theres a decent chance the most
1984        common letter will share a node with the least common, meaning the node
1985        will not be compressible. With a middle is most common approach the worst
1986        case is when we have the least common nodes twice.
1987
1988      */
1989
1990     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1991         regnode *noper = NEXTOPER( cur );
1992         const U8 *uc = (U8*)STRING( noper );
1993         const U8 *e  = uc + STR_LEN( noper );
1994         STRLEN foldlen = 0;
1995         U32 wordlen      = 0;         /* required init */
1996         STRLEN minbytes = 0;
1997         STRLEN maxbytes = 0;
1998         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1999
2000         if (OP(noper) == NOTHING) {
2001             regnode *noper_next= regnext(noper);
2002             if (noper_next != tail && OP(noper_next) == flags) {
2003                 noper = noper_next;
2004                 uc= (U8*)STRING(noper);
2005                 e= uc + STR_LEN(noper);
2006                 trie->minlen= STR_LEN(noper);
2007             } else {
2008                 trie->minlen= 0;
2009                 continue;
2010             }
2011         }
2012
2013         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2014             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2015                                           regardless of encoding */
2016             if (OP( noper ) == EXACTFU_SS) {
2017                 /* false positives are ok, so just set this */
2018                 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2019             }
2020         }
2021         for ( ; uc < e ; uc += len ) {
2022             TRIE_CHARCOUNT(trie)++;
2023             TRIE_READ_CHAR;
2024
2025             /* Acummulate to the current values, the range in the number of
2026              * bytes that this character could match.  The max is presumed to
2027              * be the same as the folded input (which TRIE_READ_CHAR returns),
2028              * except that when this is not in UTF-8, it could be matched
2029              * against a string which is UTF-8, and the variant characters
2030              * could be 2 bytes instead of the 1 here.  Likewise, for the
2031              * minimum number of bytes when not folded.  When folding, the min
2032              * is assumed to be 1 byte could fold to match the single character
2033              * here, or in the case of a multi-char fold, 1 byte can fold to
2034              * the whole sequence.  'foldlen' is used to denote whether we are
2035              * in such a sequence, skipping the min setting if so.  XXX TODO
2036              * Use the exact list of what folds to each character, from
2037              * PL_utf8_foldclosures */
2038             if (UTF) {
2039                 maxbytes += UTF8SKIP(uc);
2040                 if (! folder) {
2041                     /* A non-UTF-8 string could be 1 byte to match our 2 */
2042                     minbytes += (UTF8_IS_DOWNGRADEABLE_START(*uc))
2043                                 ? 1
2044                                 : UTF8SKIP(uc);
2045                 }
2046                 else {
2047                     if (foldlen) {
2048                         foldlen -= UTF8SKIP(uc);
2049                     }
2050                     else {
2051                         foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e);
2052                         minbytes++;
2053                     }
2054                 }
2055             }
2056             else {
2057                 maxbytes += (UNI_IS_INVARIANT(*uc))
2058                              ? 1
2059                              : 2;
2060                 if (! folder) {
2061                     minbytes++;
2062                 }
2063                 else {
2064                     if (foldlen) {
2065                         foldlen--;
2066                     }
2067                     else {
2068                         foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e);
2069                         minbytes++;
2070                     }
2071                 }
2072             }
2073             if ( uvc < 256 ) {
2074                 if ( folder ) {
2075                     U8 folded= folder[ (U8) uvc ];
2076                     if ( !trie->charmap[ folded ] ) {
2077                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
2078                         TRIE_STORE_REVCHAR( folded );
2079                     }
2080                 }
2081                 if ( !trie->charmap[ uvc ] ) {
2082                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2083                     TRIE_STORE_REVCHAR( uvc );
2084                 }
2085                 if ( set_bit ) {
2086                     /* store the codepoint in the bitmap, and its folded
2087                      * equivalent. */
2088                     TRIE_BITMAP_SET(trie, uvc);
2089
2090                     /* store the folded codepoint */
2091                     if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2092
2093                     if ( !UTF ) {
2094                         /* store first byte of utf8 representation of
2095                            variant codepoints */
2096                         if (! UVCHR_IS_INVARIANT(uvc)) {
2097                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2098                         }
2099                     }
2100                     set_bit = 0; /* We've done our bit :-) */
2101                 }
2102             } else {
2103                 SV** svpp;
2104                 if ( !widecharmap )
2105                     widecharmap = newHV();
2106
2107                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2108
2109                 if ( !svpp )
2110                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2111
2112                 if ( !SvTRUE( *svpp ) ) {
2113                     sv_setiv( *svpp, ++trie->uniquecharcount );
2114                     TRIE_STORE_REVCHAR(uvc);
2115                 }
2116             }
2117         }
2118         if( cur == first ) {
2119             trie->minlen = minbytes;
2120             trie->maxlen = maxbytes;
2121         } else if (minbytes < trie->minlen) {
2122             trie->minlen = minbytes;
2123         } else if (maxbytes > trie->maxlen) {
2124             trie->maxlen = maxbytes;
2125         }
2126     } /* end first pass */
2127     DEBUG_TRIE_COMPILE_r(
2128         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2129                 (int)depth * 2 + 2,"",
2130                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2131                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2132                 (int)trie->minlen, (int)trie->maxlen )
2133     );
2134
2135     /*
2136         We now know what we are dealing with in terms of unique chars and
2137         string sizes so we can calculate how much memory a naive
2138         representation using a flat table  will take. If it's over a reasonable
2139         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2140         conservative but potentially much slower representation using an array
2141         of lists.
2142
2143         At the end we convert both representations into the same compressed
2144         form that will be used in regexec.c for matching with. The latter
2145         is a form that cannot be used to construct with but has memory
2146         properties similar to the list form and access properties similar
2147         to the table form making it both suitable for fast searches and
2148         small enough that its feasable to store for the duration of a program.
2149
2150         See the comment in the code where the compressed table is produced
2151         inplace from the flat tabe representation for an explanation of how
2152         the compression works.
2153
2154     */
2155
2156
2157     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2158     prev_states[1] = 0;
2159
2160     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
2161         /*
2162             Second Pass -- Array Of Lists Representation
2163
2164             Each state will be represented by a list of charid:state records
2165             (reg_trie_trans_le) the first such element holds the CUR and LEN
2166             points of the allocated array. (See defines above).
2167
2168             We build the initial structure using the lists, and then convert
2169             it into the compressed table form which allows faster lookups
2170             (but cant be modified once converted).
2171         */
2172
2173         STRLEN transcount = 1;
2174
2175         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2176             "%*sCompiling trie using list compiler\n",
2177             (int)depth * 2 + 2, ""));
2178
2179         trie->states = (reg_trie_state *)
2180             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2181                                   sizeof(reg_trie_state) );
2182         TRIE_LIST_NEW(1);
2183         next_alloc = 2;
2184
2185         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2186
2187             regnode *noper   = NEXTOPER( cur );
2188             U8 *uc           = (U8*)STRING( noper );
2189             const U8 *e      = uc + STR_LEN( noper );
2190             U32 state        = 1;         /* required init */
2191             U16 charid       = 0;         /* sanity init */
2192             U32 wordlen      = 0;         /* required init */
2193
2194             if (OP(noper) == NOTHING) {
2195                 regnode *noper_next= regnext(noper);
2196                 if (noper_next != tail && OP(noper_next) == flags) {
2197                     noper = noper_next;
2198                     uc= (U8*)STRING(noper);
2199                     e= uc + STR_LEN(noper);
2200                 }
2201             }
2202
2203             if (OP(noper) != NOTHING) {
2204                 for ( ; uc < e ; uc += len ) {
2205
2206                     TRIE_READ_CHAR;
2207
2208                     if ( uvc < 256 ) {
2209                         charid = trie->charmap[ uvc ];
2210                     } else {
2211                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2212                         if ( !svpp ) {
2213                             charid = 0;
2214                         } else {
2215                             charid=(U16)SvIV( *svpp );
2216                         }
2217                     }
2218                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
2219                     if ( charid ) {
2220
2221                         U16 check;
2222                         U32 newstate = 0;
2223
2224                         charid--;
2225                         if ( !trie->states[ state ].trans.list ) {
2226                             TRIE_LIST_NEW( state );
2227                         }
2228                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
2229                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
2230                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2231                                 break;
2232                             }
2233                         }
2234                         if ( ! newstate ) {
2235                             newstate = next_alloc++;
2236                             prev_states[newstate] = state;
2237                             TRIE_LIST_PUSH( state, charid, newstate );
2238                             transcount++;
2239                         }
2240                         state = newstate;
2241                     } else {
2242                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2243                     }
2244                 }
2245             }
2246             TRIE_HANDLE_WORD(state);
2247
2248         } /* end second pass */
2249
2250         /* next alloc is the NEXT state to be allocated */
2251         trie->statecount = next_alloc;
2252         trie->states = (reg_trie_state *)
2253             PerlMemShared_realloc( trie->states,
2254                                    next_alloc
2255                                    * sizeof(reg_trie_state) );
2256
2257         /* and now dump it out before we compress it */
2258         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2259                                                          revcharmap, next_alloc,
2260                                                          depth+1)
2261         );
2262
2263         trie->trans = (reg_trie_trans *)
2264             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2265         {
2266             U32 state;
2267             U32 tp = 0;
2268             U32 zp = 0;
2269
2270
2271             for( state=1 ; state < next_alloc ; state ++ ) {
2272                 U32 base=0;
2273
2274                 /*
2275                 DEBUG_TRIE_COMPILE_MORE_r(
2276                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2277                 );
2278                 */
2279
2280                 if (trie->states[state].trans.list) {
2281                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2282                     U16 maxid=minid;
2283                     U16 idx;
2284
2285                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2286                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2287                         if ( forid < minid ) {
2288                             minid=forid;
2289                         } else if ( forid > maxid ) {
2290                             maxid=forid;
2291                         }
2292                     }
2293                     if ( transcount < tp + maxid - minid + 1) {
2294                         transcount *= 2;
2295                         trie->trans = (reg_trie_trans *)
2296                             PerlMemShared_realloc( trie->trans,
2297                                                      transcount
2298                                                      * sizeof(reg_trie_trans) );
2299                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
2300                     }
2301                     base = trie->uniquecharcount + tp - minid;
2302                     if ( maxid == minid ) {
2303                         U32 set = 0;
2304                         for ( ; zp < tp ; zp++ ) {
2305                             if ( ! trie->trans[ zp ].next ) {
2306                                 base = trie->uniquecharcount + zp - minid;
2307                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
2308                                 trie->trans[ zp ].check = state;
2309                                 set = 1;
2310                                 break;
2311                             }
2312                         }
2313                         if ( !set ) {
2314                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
2315                             trie->trans[ tp ].check = state;
2316                             tp++;
2317                             zp = tp;
2318                         }
2319                     } else {
2320                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2321                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
2322                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
2323                             trie->trans[ tid ].check = state;
2324                         }
2325                         tp += ( maxid - minid + 1 );
2326                     }
2327                     Safefree(trie->states[ state ].trans.list);
2328                 }
2329                 /*
2330                 DEBUG_TRIE_COMPILE_MORE_r(
2331                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2332                 );
2333                 */
2334                 trie->states[ state ].trans.base=base;
2335             }
2336             trie->lasttrans = tp + 1;
2337         }
2338     } else {
2339         /*
2340            Second Pass -- Flat Table Representation.
2341
2342            we dont use the 0 slot of either trans[] or states[] so we add 1 to
2343            each.  We know that we will need Charcount+1 trans at most to store
2344            the data (one row per char at worst case) So we preallocate both
2345            structures assuming worst case.
2346
2347            We then construct the trie using only the .next slots of the entry
2348            structs.
2349
2350            We use the .check field of the first entry of the node temporarily
2351            to make compression both faster and easier by keeping track of how
2352            many non zero fields are in the node.
2353
2354            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2355            transition.
2356
2357            There are two terms at use here: state as a TRIE_NODEIDX() which is
2358            a number representing the first entry of the node, and state as a
2359            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2360            and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2361            if there are 2 entrys per node. eg:
2362
2363              A B       A B
2364           1. 2 4    1. 3 7
2365           2. 0 3    3. 0 5
2366           3. 0 0    5. 0 0
2367           4. 0 0    7. 0 0
2368
2369            The table is internally in the right hand, idx form. However as we
2370            also have to deal with the states array which is indexed by nodenum
2371            we have to use TRIE_NODENUM() to convert.
2372
2373         */
2374         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2375             "%*sCompiling trie using table compiler\n",
2376             (int)depth * 2 + 2, ""));
2377
2378         trie->trans = (reg_trie_trans *)
2379             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2380                                   * trie->uniquecharcount + 1,
2381                                   sizeof(reg_trie_trans) );
2382         trie->states = (reg_trie_state *)
2383             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2384                                   sizeof(reg_trie_state) );
2385         next_alloc = trie->uniquecharcount + 1;
2386
2387
2388         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2389
2390             regnode *noper   = NEXTOPER( cur );
2391             const U8 *uc     = (U8*)STRING( noper );
2392             const U8 *e      = uc + STR_LEN( noper );
2393
2394             U32 state        = 1;         /* required init */
2395
2396             U16 charid       = 0;         /* sanity init */
2397             U32 accept_state = 0;         /* sanity init */
2398
2399             U32 wordlen      = 0;         /* required init */
2400
2401             if (OP(noper) == NOTHING) {
2402                 regnode *noper_next= regnext(noper);
2403                 if (noper_next != tail && OP(noper_next) == flags) {
2404                     noper = noper_next;
2405                     uc= (U8*)STRING(noper);
2406                     e= uc + STR_LEN(noper);
2407                 }
2408             }
2409
2410             if ( OP(noper) != NOTHING ) {
2411                 for ( ; uc < e ; uc += len ) {
2412
2413                     TRIE_READ_CHAR;
2414
2415                     if ( uvc < 256 ) {
2416                         charid = trie->charmap[ uvc ];
2417                     } else {
2418                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2419                         charid = svpp ? (U16)SvIV(*svpp) : 0;
2420                     }
2421                     if ( charid ) {
2422                         charid--;
2423                         if ( !trie->trans[ state + charid ].next ) {
2424                             trie->trans[ state + charid ].next = next_alloc;
2425                             trie->trans[ state ].check++;
2426                             prev_states[TRIE_NODENUM(next_alloc)]
2427                                     = TRIE_NODENUM(state);
2428                             next_alloc += trie->uniquecharcount;
2429                         }
2430                         state = trie->trans[ state + charid ].next;
2431                     } else {
2432                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2433                     }
2434                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
2435                 }
2436             }
2437             accept_state = TRIE_NODENUM( state );
2438             TRIE_HANDLE_WORD(accept_state);
2439
2440         } /* end second pass */
2441
2442         /* and now dump it out before we compress it */
2443         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2444                                                           revcharmap,
2445                                                           next_alloc, depth+1));
2446
2447         {
2448         /*
2449            * Inplace compress the table.*
2450
2451            For sparse data sets the table constructed by the trie algorithm will
2452            be mostly 0/FAIL transitions or to put it another way mostly empty.
2453            (Note that leaf nodes will not contain any transitions.)
2454
2455            This algorithm compresses the tables by eliminating most such
2456            transitions, at the cost of a modest bit of extra work during lookup:
2457
2458            - Each states[] entry contains a .base field which indicates the
2459            index in the state[] array wheres its transition data is stored.
2460
2461            - If .base is 0 there are no valid transitions from that node.
2462
2463            - If .base is nonzero then charid is added to it to find an entry in
2464            the trans array.
2465
2466            -If trans[states[state].base+charid].check!=state then the
2467            transition is taken to be a 0/Fail transition. Thus if there are fail
2468            transitions at the front of the node then the .base offset will point
2469            somewhere inside the previous nodes data (or maybe even into a node
2470            even earlier), but the .check field determines if the transition is
2471            valid.
2472
2473            XXX - wrong maybe?
2474            The following process inplace converts the table to the compressed
2475            table: We first do not compress the root node 1,and mark all its
2476            .check pointers as 1 and set its .base pointer as 1 as well. This
2477            allows us to do a DFA construction from the compressed table later,
2478            and ensures that any .base pointers we calculate later are greater
2479            than 0.
2480
2481            - We set 'pos' to indicate the first entry of the second node.
2482
2483            - We then iterate over the columns of the node, finding the first and
2484            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2485            and set the .check pointers accordingly, and advance pos
2486            appropriately and repreat for the next node. Note that when we copy
2487            the next pointers we have to convert them from the original
2488            NODEIDX form to NODENUM form as the former is not valid post
2489            compression.
2490
2491            - If a node has no transitions used we mark its base as 0 and do not
2492            advance the pos pointer.
2493
2494            - If a node only has one transition we use a second pointer into the
2495            structure to fill in allocated fail transitions from other states.
2496            This pointer is independent of the main pointer and scans forward
2497            looking for null transitions that are allocated to a state. When it
2498            finds one it writes the single transition into the "hole".  If the
2499            pointer doesnt find one the single transition is appended as normal.
2500
2501            - Once compressed we can Renew/realloc the structures to release the
2502            excess space.
2503
2504            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2505            specifically Fig 3.47 and the associated pseudocode.
2506
2507            demq
2508         */
2509         const U32 laststate = TRIE_NODENUM( next_alloc );
2510         U32 state, charid;
2511         U32 pos = 0, zp=0;
2512         trie->statecount = laststate;
2513
2514         for ( state = 1 ; state < laststate ; state++ ) {
2515             U8 flag = 0;
2516             const U32 stateidx = TRIE_NODEIDX( state );
2517             const U32 o_used = trie->trans[ stateidx ].check;
2518             U32 used = trie->trans[ stateidx ].check;
2519             trie->trans[ stateidx ].check = 0;
2520
2521             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2522                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2523                     if ( trie->trans[ stateidx + charid ].next ) {
2524                         if (o_used == 1) {
2525                             for ( ; zp < pos ; zp++ ) {
2526                                 if ( ! trie->trans[ zp ].next ) {
2527                                     break;
2528                                 }
2529                             }
2530                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2531                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2532                             trie->trans[ zp ].check = state;
2533                             if ( ++zp > pos ) pos = zp;
2534                             break;
2535                         }
2536                         used--;
2537                     }
2538                     if ( !flag ) {
2539                         flag = 1;
2540                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2541                     }
2542                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2543                     trie->trans[ pos ].check = state;
2544                     pos++;
2545                 }
2546             }
2547         }
2548         trie->lasttrans = pos + 1;
2549         trie->states = (reg_trie_state *)
2550             PerlMemShared_realloc( trie->states, laststate
2551                                    * sizeof(reg_trie_state) );
2552         DEBUG_TRIE_COMPILE_MORE_r(
2553                 PerlIO_printf( Perl_debug_log,
2554                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2555                     (int)depth * 2 + 2,"",
2556                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2557                     (IV)next_alloc,
2558                     (IV)pos,
2559                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2560             );
2561
2562         } /* end table compress */
2563     }
2564     DEBUG_TRIE_COMPILE_MORE_r(
2565             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2566                 (int)depth * 2 + 2, "",
2567                 (UV)trie->statecount,
2568                 (UV)trie->lasttrans)
2569     );
2570     /* resize the trans array to remove unused space */
2571     trie->trans = (reg_trie_trans *)
2572         PerlMemShared_realloc( trie->trans, trie->lasttrans
2573                                * sizeof(reg_trie_trans) );
2574
2575     {   /* Modify the program and insert the new TRIE node */
2576         U8 nodetype =(U8)(flags & 0xFF);
2577         char *str=NULL;
2578
2579 #ifdef DEBUGGING
2580         regnode *optimize = NULL;
2581 #ifdef RE_TRACK_PATTERN_OFFSETS
2582
2583         U32 mjd_offset = 0;
2584         U32 mjd_nodelen = 0;
2585 #endif /* RE_TRACK_PATTERN_OFFSETS */
2586 #endif /* DEBUGGING */
2587         /*
2588            This means we convert either the first branch or the first Exact,
2589            depending on whether the thing following (in 'last') is a branch
2590            or not and whther first is the startbranch (ie is it a sub part of
2591            the alternation or is it the whole thing.)
2592            Assuming its a sub part we convert the EXACT otherwise we convert
2593            the whole branch sequence, including the first.
2594          */
2595         /* Find the node we are going to overwrite */
2596         if ( first != startbranch || OP( last ) == BRANCH ) {
2597             /* branch sub-chain */
2598             NEXT_OFF( first ) = (U16)(last - first);
2599 #ifdef RE_TRACK_PATTERN_OFFSETS
2600             DEBUG_r({
2601                 mjd_offset= Node_Offset((convert));
2602                 mjd_nodelen= Node_Length((convert));
2603             });
2604 #endif
2605             /* whole branch chain */
2606         }
2607 #ifdef RE_TRACK_PATTERN_OFFSETS
2608         else {
2609             DEBUG_r({
2610                 const  regnode *nop = NEXTOPER( convert );
2611                 mjd_offset= Node_Offset((nop));
2612                 mjd_nodelen= Node_Length((nop));
2613             });
2614         }
2615         DEBUG_OPTIMISE_r(
2616             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2617                 (int)depth * 2 + 2, "",
2618                 (UV)mjd_offset, (UV)mjd_nodelen)
2619         );
2620 #endif
2621         /* But first we check to see if there is a common prefix we can
2622            split out as an EXACT and put in front of the TRIE node.  */
2623         trie->startstate= 1;
2624         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2625             U32 state;
2626             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2627                 U32 ofs = 0;
2628                 I32 idx = -1;
2629                 U32 count = 0;
2630                 const U32 base = trie->states[ state ].trans.base;
2631
2632                 if ( trie->states[state].wordnum )
2633                         count = 1;
2634
2635                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2636                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2637                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2638                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2639                     {
2640                         if ( ++count > 1 ) {
2641                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2642                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2643                             if ( state == 1 ) break;
2644                             if ( count == 2 ) {
2645                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2646                                 DEBUG_OPTIMISE_r(
2647                                     PerlIO_printf(Perl_debug_log,
2648                                         "%*sNew Start State=%"UVuf" Class: [",
2649                                         (int)depth * 2 + 2, "",
2650                                         (UV)state));
2651                                 if (idx >= 0) {
2652                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2653                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2654
2655                                     TRIE_BITMAP_SET(trie,*ch);
2656                                     if ( folder )
2657                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2658                                     DEBUG_OPTIMISE_r(
2659                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2660                                     );
2661                                 }
2662                             }
2663                             TRIE_BITMAP_SET(trie,*ch);
2664                             if ( folder )
2665                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2666                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2667                         }
2668                         idx = ofs;
2669                     }
2670                 }
2671                 if ( count == 1 ) {
2672                     SV **tmp = av_fetch( revcharmap, idx, 0);
2673                     STRLEN len;
2674                     char *ch = SvPV( *tmp, len );
2675                     DEBUG_OPTIMISE_r({
2676                         SV *sv=sv_newmortal();
2677                         PerlIO_printf( Perl_debug_log,
2678                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2679                             (int)depth * 2 + 2, "",
2680                             (UV)state, (UV)idx,
2681                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2682                                 PL_colors[0], PL_colors[1],
2683                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2684                                 PERL_PV_ESCAPE_FIRSTCHAR
2685                             )
2686                         );
2687                     });
2688                     if ( state==1 ) {
2689                         OP( convert ) = nodetype;
2690                         str=STRING(convert);
2691                         STR_LEN(convert)=0;
2692                     }
2693                     STR_LEN(convert) += len;
2694                     while (len--)
2695                         *str++ = *ch++;
2696                 } else {
2697 #ifdef DEBUGGING
2698                     if (state>1)
2699                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2700 #endif
2701                     break;
2702                 }
2703             }
2704             trie->prefixlen = (state-1);
2705             if (str) {
2706                 regnode *n = convert+NODE_SZ_STR(convert);
2707                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2708                 trie->startstate = state;
2709                 trie->minlen -= (state - 1);
2710                 trie->maxlen -= (state - 1);
2711 #ifdef DEBUGGING
2712                /* At least the UNICOS C compiler choked on this
2713                 * being argument to DEBUG_r(), so let's just have
2714                 * it right here. */
2715                if (
2716 #ifdef PERL_EXT_RE_BUILD
2717                    1
2718 #else
2719                    DEBUG_r_TEST
2720 #endif
2721                    ) {
2722                    regnode *fix = convert;
2723                    U32 word = trie->wordcount;
2724                    mjd_nodelen++;
2725                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2726                    while( ++fix < n ) {
2727                        Set_Node_Offset_Length(fix, 0, 0);
2728                    }
2729                    while (word--) {
2730                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2731                        if (tmp) {
2732                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2733                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2734                            else
2735                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2736                        }
2737                    }
2738                }
2739 #endif
2740                 if (trie->maxlen) {
2741                     convert = n;
2742                 } else {
2743                     NEXT_OFF(convert) = (U16)(tail - convert);
2744                     DEBUG_r(optimize= n);
2745                 }
2746             }
2747         }
2748         if (!jumper)
2749             jumper = last;
2750         if ( trie->maxlen ) {
2751             NEXT_OFF( convert ) = (U16)(tail - convert);
2752             ARG_SET( convert, data_slot );
2753             /* Store the offset to the first unabsorbed branch in
2754                jump[0], which is otherwise unused by the jump logic.
2755                We use this when dumping a trie and during optimisation. */
2756             if (trie->jump)
2757                 trie->jump[0] = (U16)(nextbranch - convert);
2758
2759             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2760              *   and there is a bitmap
2761              *   and the first "jump target" node we found leaves enough room
2762              * then convert the TRIE node into a TRIEC node, with the bitmap
2763              * embedded inline in the opcode - this is hypothetically faster.
2764              */
2765             if ( !trie->states[trie->startstate].wordnum
2766                  && trie->bitmap
2767                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2768             {
2769                 OP( convert ) = TRIEC;
2770                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2771                 PerlMemShared_free(trie->bitmap);
2772                 trie->bitmap= NULL;
2773             } else
2774                 OP( convert ) = TRIE;
2775
2776             /* store the type in the flags */
2777             convert->flags = nodetype;
2778             DEBUG_r({
2779             optimize = convert
2780                       + NODE_STEP_REGNODE
2781                       + regarglen[ OP( convert ) ];
2782             });
2783             /* XXX We really should free up the resource in trie now,
2784                    as we won't use them - (which resources?) dmq */
2785         }
2786         /* needed for dumping*/
2787         DEBUG_r(if (optimize) {
2788             regnode *opt = convert;
2789
2790             while ( ++opt < optimize) {
2791                 Set_Node_Offset_Length(opt,0,0);
2792             }
2793             /*
2794                 Try to clean up some of the debris left after the
2795                 optimisation.
2796              */
2797             while( optimize < jumper ) {
2798                 mjd_nodelen += Node_Length((optimize));
2799                 OP( optimize ) = OPTIMIZED;
2800                 Set_Node_Offset_Length(optimize,0,0);
2801                 optimize++;
2802             }
2803             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2804         });
2805     } /* end node insert */
2806
2807     /*  Finish populating the prev field of the wordinfo array.  Walk back
2808      *  from each accept state until we find another accept state, and if
2809      *  so, point the first word's .prev field at the second word. If the
2810      *  second already has a .prev field set, stop now. This will be the
2811      *  case either if we've already processed that word's accept state,
2812      *  or that state had multiple words, and the overspill words were
2813      *  already linked up earlier.
2814      */
2815     {
2816         U16 word;
2817         U32 state;
2818         U16 prev;
2819
2820         for (word=1; word <= trie->wordcount; word++) {
2821             prev = 0;
2822             if (trie->wordinfo[word].prev)
2823                 continue;
2824             state = trie->wordinfo[word].accept;
2825             while (state) {
2826                 state = prev_states[state];
2827                 if (!state)
2828                     break;
2829                 prev = trie->states[state].wordnum;
2830                 if (prev)
2831                     break;
2832             }
2833             trie->wordinfo[word].prev = prev;
2834         }
2835         Safefree(prev_states);
2836     }
2837
2838
2839     /* and now dump out the compressed format */
2840     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2841
2842     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2843 #ifdef DEBUGGING
2844     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2845     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2846 #else
2847     SvREFCNT_dec_NN(revcharmap);
2848 #endif
2849     return trie->jump
2850            ? MADE_JUMP_TRIE
2851            : trie->startstate>1
2852              ? MADE_EXACT_TRIE
2853              : MADE_TRIE;
2854 }
2855
2856 STATIC void
2857 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2858 {
2859 /* The Trie is constructed and compressed now so we can build a fail array if
2860  * it's needed
2861
2862    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
2863    3.32 in the
2864    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
2865    Ullman 1985/88
2866    ISBN 0-201-10088-6
2867
2868    We find the fail state for each state in the trie, this state is the longest
2869    proper suffix of the current state's 'word' that is also a proper prefix of
2870    another word in our trie. State 1 represents the word '' and is thus the
2871    default fail state. This allows the DFA not to have to restart after its
2872    tried and failed a word at a given point, it simply continues as though it
2873    had been matching the other word in the first place.
2874    Consider
2875       'abcdgu'=~/abcdefg|cdgu/
2876    When we get to 'd' we are still matching the first word, we would encounter
2877    'g' which would fail, which would bring us to the state representing 'd' in
2878    the second word where we would try 'g' and succeed, proceeding to match
2879    'cdgu'.
2880  */
2881  /* add a fail transition */
2882     const U32 trie_offset = ARG(source);
2883     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2884     U32 *q;
2885     const U32 ucharcount = trie->uniquecharcount;
2886     const U32 numstates = trie->statecount;
2887     const U32 ubound = trie->lasttrans + ucharcount;
2888     U32 q_read = 0;
2889     U32 q_write = 0;
2890     U32 charid;
2891     U32 base = trie->states[ 1 ].trans.base;
2892     U32 *fail;
2893     reg_ac_data *aho;
2894     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
2895     GET_RE_DEBUG_FLAGS_DECL;
2896
2897     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2898 #ifndef DEBUGGING
2899     PERL_UNUSED_ARG(depth);
2900 #endif
2901
2902
2903     ARG_SET( stclass, data_slot );
2904     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2905     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2906     aho->trie=trie_offset;
2907     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2908     Copy( trie->states, aho->states, numstates, reg_trie_state );
2909     Newxz( q, numstates, U32);
2910     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2911     aho->refcount = 1;
2912     fail = aho->fail;
2913     /* initialize fail[0..1] to be 1 so that we always have
2914        a valid final fail state */
2915     fail[ 0 ] = fail[ 1 ] = 1;
2916
2917     for ( charid = 0; charid < ucharcount ; charid++ ) {
2918         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2919         if ( newstate ) {
2920             q[ q_write ] = newstate;
2921             /* set to point at the root */
2922             fail[ q[ q_write++ ] ]=1;
2923         }
2924     }
2925     while ( q_read < q_write) {
2926         const U32 cur = q[ q_read++ % numstates ];
2927         base = trie->states[ cur ].trans.base;
2928
2929         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2930             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2931             if (ch_state) {
2932                 U32 fail_state = cur;
2933                 U32 fail_base;
2934                 do {
2935                     fail_state = fail[ fail_state ];
2936                     fail_base = aho->states[ fail_state ].trans.base;
2937                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2938
2939                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2940                 fail[ ch_state ] = fail_state;
2941                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2942                 {
2943                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2944                 }
2945                 q[ q_write++ % numstates] = ch_state;
2946             }
2947         }
2948     }
2949     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2950        when we fail in state 1, this allows us to use the
2951        charclass scan to find a valid start char. This is based on the principle
2952        that theres a good chance the string being searched contains lots of stuff
2953        that cant be a start char.
2954      */
2955     fail[ 0 ] = fail[ 1 ] = 0;
2956     DEBUG_TRIE_COMPILE_r({
2957         PerlIO_printf(Perl_debug_log,
2958                       "%*sStclass Failtable (%"UVuf" states): 0",
2959                       (int)(depth * 2), "", (UV)numstates
2960         );
2961         for( q_read=1; q_read<numstates; q_read++ ) {
2962             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2963         }
2964         PerlIO_printf(Perl_debug_log, "\n");
2965     });
2966     Safefree(q);
2967     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2968 }
2969
2970
2971 #define DEBUG_PEEP(str,scan,depth) \
2972     DEBUG_OPTIMISE_r({if (scan){ \
2973        SV * const mysv=sv_newmortal(); \
2974        regnode *Next = regnext(scan); \
2975        regprop(RExC_rx, mysv, scan); \
2976        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2977        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2978        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2979    }});
2980
2981
2982 /* The below joins as many adjacent EXACTish nodes as possible into a single
2983  * one.  The regop may be changed if the node(s) contain certain sequences that
2984  * require special handling.  The joining is only done if:
2985  * 1) there is room in the current conglomerated node to entirely contain the
2986  *    next one.
2987  * 2) they are the exact same node type
2988  *
2989  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2990  * these get optimized out
2991  *
2992  * If a node is to match under /i (folded), the number of characters it matches
2993  * can be different than its character length if it contains a multi-character
2994  * fold.  *min_subtract is set to the total delta of the input nodes.
2995  *
2996  * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2997  * and contains LATIN SMALL LETTER SHARP S
2998  *
2999  * This is as good a place as any to discuss the design of handling these
3000  * multi-character fold sequences.  It's been wrong in Perl for a very long
3001  * time.  There are three code points in Unicode whose multi-character folds
3002  * were long ago discovered to mess things up.  The previous designs for
3003  * dealing with these involved assigning a special node for them.  This
3004  * approach doesn't work, as evidenced by this example:
3005  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
3006  * Both these fold to "sss", but if the pattern is parsed to create a node that
3007  * would match just the \xDF, it won't be able to handle the case where a
3008  * successful match would have to cross the node's boundary.  The new approach
3009  * that hopefully generally solves the problem generates an EXACTFU_SS node
3010  * that is "sss".
3011  *
3012  * It turns out that there are problems with all multi-character folds, and not
3013  * just these three.  Now the code is general, for all such cases.  The
3014  * approach taken is:
3015  * 1)   This routine examines each EXACTFish node that could contain multi-
3016  *      character fold sequences.  It returns in *min_subtract how much to
3017  *      subtract from the the actual length of the string to get a real minimum
3018  *      match length; it is 0 if there are no multi-char folds.  This delta is
3019  *      used by the caller to adjust the min length of the match, and the delta
3020  *      between min and max, so that the optimizer doesn't reject these
3021  *      possibilities based on size constraints.
3022  * 2)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3023  *      is used for an EXACTFU node that contains at least one "ss" sequence in
3024  *      it.  For non-UTF-8 patterns and strings, this is the only case where
3025  *      there is a possible fold length change.  That means that a regular
3026  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
3027  *      with length changes, and so can be processed faster.  regexec.c takes
3028  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
3029  *      pre-folded by regcomp.c.  This saves effort in regex matching.
3030  *      However, the pre-folding isn't done for non-UTF8 patterns because the
3031  *      fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
3032  *      down by forcing the pattern into UTF8 unless necessary.  Also what
3033  *      EXACTF and EXACTFL nodes fold to isn't known until runtime.  The fold
3034  *      possibilities for the non-UTF8 patterns are quite simple, except for
3035  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
3036  *      members of a fold-pair, and arrays are set up for all of them so that
3037  *      the other member of the pair can be found quickly.  Code elsewhere in
3038  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3039  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
3040  *      described in the next item.
3041  * 3)   A problem remains for the sharp s in EXACTF and EXACTFA nodes when the
3042  *      pattern isn't in UTF-8. (BTW, there cannot be an EXACTF node with a
3043  *      UTF-8 pattern.)  An assumption that the optimizer part of regexec.c
3044  *      (probably unwittingly, in Perl_regexec_flags()) makes is that a
3045  *      character in the pattern corresponds to at most a single character in
3046  *      the target string.  (And I do mean character, and not byte here, unlike
3047  *      other parts of the documentation that have never been updated to
3048  *      account for multibyte Unicode.)  sharp s in EXACTF nodes can match the
3049  *      two character string 'ss'; in EXACTFA nodes it can match
3050  *      "\x{17F}\x{17F}".  These violate the assumption, and they are the only
3051  *      instances where it is violated.  I'm reluctant to try to change the
3052  *      assumption, as the code involved is impenetrable to me (khw), so
3053  *      instead the code here punts.  This routine examines (when the pattern
3054  *      isn't UTF-8) EXACTF and EXACTFA nodes for the sharp s, and returns a
3055  *      boolean indicating whether or not the node contains a sharp s.  When it
3056  *      is true, the caller sets a flag that later causes the optimizer in this
3057  *      file to not set values for the floating and fixed string lengths, and
3058  *      thus avoids the optimizer code in regexec.c that makes the invalid
3059  *      assumption.  Thus, there is no optimization based on string lengths for
3060  *      non-UTF8-pattern EXACTF and EXACTFA nodes that contain the sharp s.
3061  *      (The reason the assumption is wrong only in these two cases is that all
3062  *      other non-UTF-8 folds are 1-1; and, for UTF-8 patterns, we pre-fold all
3063  *      other folds to their expanded versions.  We can't prefold sharp s to
3064  *      'ss' in EXACTF nodes because we don't know at compile time if it
3065  *      actually matches 'ss' or not.  It will match iff the target string is
3066  *      in UTF-8, unlike the EXACTFU nodes, where it always matches; and
3067  *      EXACTFA and EXACTFL where it never does.  In an EXACTFA node in a UTF-8
3068  *      pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the problem;
3069  *      but in a non-UTF8 pattern, folding it to that above-Latin1 string would
3070  *      require the pattern to be forced into UTF-8, the overhead of which we
3071  *      want to avoid.)
3072  *
3073  *      Similarly, the code that generates tries doesn't currently handle
3074  *      not-already-folded multi-char folds, and it looks like a pain to change
3075  *      that.  Therefore, trie generation of EXACTFA nodes with the sharp s
3076  *      doesn't work.  Instead, such an EXACTFA is turned into a new regnode,
3077  *      EXACTFA_NO_TRIE, which the trie code knows not to handle.  Most people
3078  *      using /iaa matching will be doing so almost entirely with ASCII
3079  *      strings, so this should rarely be encountered in practice */
3080
3081 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
3082     if (PL_regkind[OP(scan)] == EXACT) \
3083         join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
3084
3085 STATIC U32
3086 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
3087     /* Merge several consecutive EXACTish nodes into one. */
3088     regnode *n = regnext(scan);
3089     U32 stringok = 1;
3090     regnode *next = scan + NODE_SZ_STR(scan);
3091     U32 merged = 0;
3092     U32 stopnow = 0;
3093 #ifdef DEBUGGING
3094     regnode *stop = scan;
3095     GET_RE_DEBUG_FLAGS_DECL;
3096 #else
3097     PERL_UNUSED_ARG(depth);
3098 #endif
3099
3100     PERL_ARGS_ASSERT_JOIN_EXACT;
3101 #ifndef EXPERIMENTAL_INPLACESCAN
3102     PERL_UNUSED_ARG(flags);
3103     PERL_UNUSED_ARG(val);
3104 #endif
3105     DEBUG_PEEP("join",scan,depth);
3106
3107     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
3108      * EXACT ones that are mergeable to the current one. */
3109     while (n
3110            && (PL_regkind[OP(n)] == NOTHING
3111                || (stringok && OP(n) == OP(scan)))
3112            && NEXT_OFF(n)
3113            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3114     {
3115
3116         if (OP(n) == TAIL || n > next)
3117             stringok = 0;
3118         if (PL_regkind[OP(n)] == NOTHING) {
3119             DEBUG_PEEP("skip:",n,depth);
3120             NEXT_OFF(scan) += NEXT_OFF(n);
3121             next = n + NODE_STEP_REGNODE;
3122 #ifdef DEBUGGING
3123             if (stringok)
3124                 stop = n;
3125 #endif
3126             n = regnext(n);
3127         }
3128         else if (stringok) {
3129             const unsigned int oldl = STR_LEN(scan);
3130             regnode * const nnext = regnext(n);
3131
3132             /* XXX I (khw) kind of doubt that this works on platforms where
3133              * U8_MAX is above 255 because of lots of other assumptions */
3134             /* Don't join if the sum can't fit into a single node */
3135             if (oldl + STR_LEN(n) > U8_MAX)
3136                 break;
3137
3138             DEBUG_PEEP("merg",n,depth);
3139             merged++;
3140
3141             NEXT_OFF(scan) += NEXT_OFF(n);
3142             STR_LEN(scan) += STR_LEN(n);
3143             next = n + NODE_SZ_STR(n);
3144             /* Now we can overwrite *n : */
3145             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3146 #ifdef DEBUGGING
3147             stop = next - 1;
3148 #endif
3149             n = nnext;
3150             if (stopnow) break;
3151         }
3152
3153 #ifdef EXPERIMENTAL_INPLACESCAN
3154         if (flags && !NEXT_OFF(n)) {
3155             DEBUG_PEEP("atch", val, depth);
3156             if (reg_off_by_arg[OP(n)]) {
3157                 ARG_SET(n, val - n);
3158             }
3159             else {
3160                 NEXT_OFF(n) = val - n;
3161             }
3162             stopnow = 1;
3163         }
3164 #endif
3165     }
3166
3167     *min_subtract = 0;
3168     *has_exactf_sharp_s = FALSE;
3169
3170     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
3171      * can now analyze for sequences of problematic code points.  (Prior to
3172      * this final joining, sequences could have been split over boundaries, and
3173      * hence missed).  The sequences only happen in folding, hence for any
3174      * non-EXACT EXACTish node */
3175     if (OP(scan) != EXACT) {
3176         const U8 * const s0 = (U8*) STRING(scan);
3177         const U8 * s = s0;
3178         const U8 * const s_end = s0 + STR_LEN(scan);
3179
3180         /* One pass is made over the node's string looking for all the
3181          * possibilities.  to avoid some tests in the loop, there are two main
3182          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3183          * non-UTF-8 */
3184         if (UTF) {
3185
3186             /* Examine the string for a multi-character fold sequence.  UTF-8
3187              * patterns have all characters pre-folded by the time this code is
3188              * executed */
3189             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3190                                      length sequence we are looking for is 2 */
3191             {
3192                 int count = 0;
3193                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3194                 if (! len) {    /* Not a multi-char fold: get next char */
3195                     s += UTF8SKIP(s);
3196                     continue;
3197                 }
3198
3199                 /* Nodes with 'ss' require special handling, except for EXACTFL
3200                  * and EXACTFA-ish for which there is no multi-char fold to
3201                  * this */
3202                 if (len == 2 && *s == 's' && *(s+1) == 's'
3203                     && OP(scan) != EXACTFL
3204                     && OP(scan) != EXACTFA
3205                     && OP(scan) != EXACTFA_NO_TRIE)
3206                 {
3207                     count = 2;
3208                     OP(scan) = EXACTFU_SS;
3209                     s += 2;
3210                 }
3211                 else { /* Here is a generic multi-char fold. */
3212                     const U8* multi_end  = s + len;
3213
3214                     /* Count how many characters in it.  In the case of /l and
3215                      * /aa, no folds which contain ASCII code points are
3216                      * allowed, so check for those, and skip if found.  (In
3217                      * EXACTFL, no folds are allowed to any Latin1 code point,
3218                      * not just ASCII.  But there aren't any of these
3219                      * currently, nor ever likely, so don't take the time to
3220                      * test for them.  The code that generates the
3221                      * is_MULTI_foo() macros croaks should one actually get put
3222                      * into Unicode .) */
3223                     if (OP(scan) != EXACTFL
3224                         && OP(scan) != EXACTFA
3225                         && OP(scan) != EXACTFA_NO_TRIE)
3226                     {
3227                         count = utf8_length(s, multi_end);
3228                         s = multi_end;
3229                     }
3230                     else {
3231                         while (s < multi_end) {
3232                             if (isASCII(*s)) {
3233                                 s++;
3234                                 goto next_iteration;
3235                             }
3236                             else {
3237                                 s += UTF8SKIP(s);
3238                             }
3239                             count++;
3240                         }
3241                     }
3242                 }
3243
3244                 /* The delta is how long the sequence is minus 1 (1 is how long
3245                  * the character that folds to the sequence is) */
3246                 *min_subtract += count - 1;
3247             next_iteration: ;
3248             }
3249         }
3250         else if (OP(scan) == EXACTFA) {
3251
3252             /* Non-UTF-8 pattern, EXACTFA node.  There can't be a multi-char
3253              * fold to the ASCII range (and there are no existing ones in the
3254              * upper latin1 range).  But, as outlined in the comments preceding
3255              * this function, we need to flag any occurrences of the sharp s.
3256              * This character forbids trie formation (because of added
3257              * complexity) */
3258             while (s < s_end) {
3259                 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3260                     OP(scan) = EXACTFA_NO_TRIE;
3261                     *has_exactf_sharp_s = TRUE;
3262                     break;
3263                 }
3264                 s++;
3265                 continue;
3266             }
3267         }
3268         else if (OP(scan) != EXACTFL) {
3269
3270             /* Non-UTF-8 pattern, not EXACTFA nor EXACTFL node.  Look for the
3271              * multi-char folds that are all Latin1.  (This code knows that
3272              * there are no current multi-char folds possible with EXACTFL,
3273              * relying on fold_grind.t to catch any errors if the very unlikely
3274              * event happens that some get added in future Unicode versions.)
3275              * As explained in the comments preceding this function, we look
3276              * also for the sharp s in EXACTF nodes; it can be in the final
3277              * position.  Otherwise we can stop looking 1 byte earlier because
3278              * have to find at least two characters for a multi-fold */
3279             const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
3280
3281             while (s < upper) {
3282                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3283                 if (! len) {    /* Not a multi-char fold. */
3284                     if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
3285                     {
3286                         *has_exactf_sharp_s = TRUE;
3287                     }
3288                     s++;
3289                     continue;
3290                 }
3291
3292                 if (len == 2
3293                     && isARG2_lower_or_UPPER_ARG1('s', *s)
3294                     && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
3295                 {
3296
3297                     /* EXACTF nodes need to know that the minimum length
3298                      * changed so that a sharp s in the string can match this
3299                      * ss in the pattern, but they remain EXACTF nodes, as they
3300                      * won't match this unless the target string is is UTF-8,
3301                      * which we don't know until runtime */
3302                     if (OP(scan) != EXACTF) {
3303                         OP(scan) = EXACTFU_SS;
3304                     }
3305                 }
3306
3307                 *min_subtract += len - 1;
3308                 s += len;
3309             }
3310         }
3311     }
3312
3313 #ifdef DEBUGGING
3314     /* Allow dumping but overwriting the collection of skipped
3315      * ops and/or strings with fake optimized ops */
3316     n = scan + NODE_SZ_STR(scan);
3317     while (n <= stop) {
3318         OP(n) = OPTIMIZED;
3319         FLAGS(n) = 0;
3320         NEXT_OFF(n) = 0;
3321         n++;
3322     }
3323 #endif
3324     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3325     return stopnow;
3326 }
3327
3328 /* REx optimizer.  Converts nodes into quicker variants "in place".
3329    Finds fixed substrings.  */
3330
3331 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3332    to the position after last scanned or to NULL. */
3333
3334 #define INIT_AND_WITHP \
3335     assert(!and_withp); \
3336     Newx(and_withp,1, regnode_ssc); \
3337     SAVEFREEPV(and_withp)
3338
3339 /* this is a chain of data about sub patterns we are processing that
3340    need to be handled separately/specially in study_chunk. Its so
3341    we can simulate recursion without losing state.  */
3342 struct scan_frame;
3343 typedef struct scan_frame {
3344     regnode *last;  /* last node to process in this frame */
3345     regnode *next;  /* next node to process when last is reached */
3346     struct scan_frame *prev; /*previous frame*/
3347     U32 prev_recursed_depth;
3348     I32 stop; /* what stopparen do we use */
3349 } scan_frame;
3350
3351
3352 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
3353
3354 STATIC SSize_t
3355 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3356                         SSize_t *minlenp, SSize_t *deltap,
3357                         regnode *last,
3358                         scan_data_t *data,
3359                         I32 stopparen,
3360                         U32 recursed_depth,
3361                         regnode_ssc *and_withp,
3362                         U32 flags, U32 depth)
3363                         /* scanp: Start here (read-write). */
3364                         /* deltap: Write maxlen-minlen here. */
3365                         /* last: Stop before this one. */
3366                         /* data: string data about the pattern */
3367                         /* stopparen: treat close N as END */
3368                         /* recursed: which subroutines have we recursed into */
3369                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3370 {
3371     dVAR;
3372     /* There must be at least this number of characters to match */
3373     SSize_t min = 0;
3374     I32 pars = 0, code;
3375     regnode *scan = *scanp, *next;
3376     SSize_t delta = 0;
3377     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3378     int is_inf_internal = 0;            /* The studied chunk is infinite */
3379     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3380     scan_data_t data_fake;
3381     SV *re_trie_maxbuff = NULL;
3382     regnode *first_non_open = scan;
3383     SSize_t stopmin = SSize_t_MAX;
3384     scan_frame *frame = NULL;
3385     GET_RE_DEBUG_FLAGS_DECL;
3386
3387     PERL_ARGS_ASSERT_STUDY_CHUNK;
3388
3389 #ifdef DEBUGGING
3390     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3391 #endif
3392     if ( depth == 0 ) {
3393         while (first_non_open && OP(first_non_open) == OPEN)
3394             first_non_open=regnext(first_non_open);
3395     }
3396
3397
3398   fake_study_recurse:
3399     while ( scan && OP(scan) != END && scan < last ){
3400         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3401                                    node length to get a real minimum (because
3402                                    the folded version may be shorter) */
3403         bool has_exactf_sharp_s = FALSE;
3404         /* Peephole optimizer: */
3405         DEBUG_OPTIMISE_MORE_r(
3406         {
3407             PerlIO_printf(Perl_debug_log,"%*sstudy_chunk stopparen=%d depth=%u recursed_depth=%u ",
3408                 (depth*2), "", stopparen, depth, recursed_depth);
3409             if (recursed_depth) {
3410                 U32 i;
3411                 U32 j;
3412                 for ( j = 0 ; j < recursed_depth ; j++ ) {
3413                     PerlIO_printf(Perl_debug_log,"[");
3414                     for ( i = 0 ; i < (U32)RExC_npar ; i++ )
3415                         PerlIO_printf(Perl_debug_log,"%d",
3416                             PAREN_TEST(RExC_study_chunk_recursed +
3417                                        (j * RExC_study_chunk_recursed_bytes), i)
3418                             ? 1 : 0
3419                         );
3420                     PerlIO_printf(Perl_debug_log,"]");
3421                 }
3422             }
3423             PerlIO_printf(Perl_debug_log,"\n");
3424         }
3425         );
3426         DEBUG_STUDYDATA("Peep:", data, depth);
3427         DEBUG_PEEP("Peep", scan, depth);
3428
3429
3430         /* Its not clear to khw or hv why this is done here, and not in the
3431          * clauses that deal with EXACT nodes.  khw's guess is that it's
3432          * because of a previous design */
3433         JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3434
3435         /* Follow the next-chain of the current node and optimize
3436            away all the NOTHINGs from it.  */
3437         if (OP(scan) != CURLYX) {
3438             const int max = (reg_off_by_arg[OP(scan)]
3439                        ? I32_MAX
3440                        /* I32 may be smaller than U16 on CRAYs! */
3441                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3442             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3443             int noff;
3444             regnode *n = scan;
3445
3446             /* Skip NOTHING and LONGJMP. */
3447             while ((n = regnext(n))
3448                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3449                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3450                    && off + noff < max)
3451                 off += noff;
3452             if (reg_off_by_arg[OP(scan)])
3453                 ARG(scan) = off;
3454             else
3455                 NEXT_OFF(scan) = off;
3456         }
3457
3458
3459
3460         /* The principal pseudo-switch.  Cannot be a switch, since we
3461            look into several different things.  */
3462         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3463                    || OP(scan) == IFTHEN) {
3464             next = regnext(scan);
3465             code = OP(scan);
3466             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3467
3468             if (OP(next) == code || code == IFTHEN) {
3469                 /* NOTE - There is similar code to this block below for
3470                  * handling TRIE nodes on a re-study.  If you change stuff here
3471                  * check there too. */
3472                 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3473                 regnode_ssc accum;
3474                 regnode * const startbranch=scan;
3475
3476                 if (flags & SCF_DO_SUBSTR)
3477                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3478                 if (flags & SCF_DO_STCLASS)
3479                     ssc_init_zero(pRExC_state, &accum);
3480
3481                 while (OP(scan) == code) {
3482                     SSize_t deltanext, minnext, fake;
3483                     I32 f = 0;
3484                     regnode_ssc this_class;
3485
3486                     num++;
3487                     data_fake.flags = 0;
3488                     if (data) {
3489                         data_fake.whilem_c = data->whilem_c;
3490                         data_fake.last_closep = data->last_closep;
3491                     }
3492                     else
3493                         data_fake.last_closep = &fake;
3494
3495                     data_fake.pos_delta = delta;
3496                     next = regnext(scan);
3497                     scan = NEXTOPER(scan);
3498                     if (code != BRANCH)
3499                         scan = NEXTOPER(scan);
3500                     if (flags & SCF_DO_STCLASS) {
3501                         ssc_init(pRExC_state, &this_class);
3502                         data_fake.start_class = &this_class;
3503                         f = SCF_DO_STCLASS_AND;
3504                     }
3505                     if (flags & SCF_WHILEM_VISITED_POS)
3506                         f |= SCF_WHILEM_VISITED_POS;
3507
3508                     /* we suppose the run is continuous, last=next...*/
3509                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3510                                           next, &data_fake,
3511                                           stopparen, recursed_depth, NULL, f,depth+1);
3512                     if (min1 > minnext)
3513                         min1 = minnext;
3514                     if (deltanext == SSize_t_MAX) {
3515                         is_inf = is_inf_internal = 1;
3516                         max1 = SSize_t_MAX;
3517                     } else if (max1 < minnext + deltanext)
3518                         max1 = minnext + deltanext;
3519                     scan = next;
3520                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3521                         pars++;
3522                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3523                         if ( stopmin > minnext)
3524                             stopmin = min + min1;
3525                         flags &= ~SCF_DO_SUBSTR;
3526                         if (data)
3527                             data->flags |= SCF_SEEN_ACCEPT;
3528                     }
3529                     if (data) {
3530                         if (data_fake.flags & SF_HAS_EVAL)
3531                             data->flags |= SF_HAS_EVAL;
3532                         data->whilem_c = data_fake.whilem_c;
3533                     }
3534                     if (flags & SCF_DO_STCLASS)
3535                         ssc_or(pRExC_state, &accum, &this_class);
3536                 }
3537                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3538                     min1 = 0;
3539                 if (flags & SCF_DO_SUBSTR) {
3540                     data->pos_min += min1;
3541                     if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3542                         data->pos_delta = SSize_t_MAX;
3543                     else
3544                         data->pos_delta += max1 - min1;
3545                     if (max1 != min1 || is_inf)
3546                         data->longest = &(data->longest_float);
3547                 }
3548                 min += min1;
3549                 if (delta == SSize_t_MAX
3550                  || SSize_t_MAX - delta - (max1 - min1) < 0)
3551                     delta = SSize_t_MAX;
3552                 else
3553                     delta += max1 - min1;
3554                 if (flags & SCF_DO_STCLASS_OR) {
3555                     ssc_or(pRExC_state, data->start_class, &accum);
3556                     if (min1) {
3557                         ssc_and(pRExC_state, data->start_class, and_withp);
3558                         flags &= ~SCF_DO_STCLASS;
3559                     }
3560                 }
3561                 else if (flags & SCF_DO_STCLASS_AND) {
3562                     if (min1) {
3563                         ssc_and(pRExC_state, data->start_class, &accum);
3564                         flags &= ~SCF_DO_STCLASS;
3565                     }
3566                     else {
3567                         /* Switch to OR mode: cache the old value of
3568                          * data->start_class */
3569                         INIT_AND_WITHP;
3570                         StructCopy(data->start_class, and_withp, regnode_ssc);
3571                         flags &= ~SCF_DO_STCLASS_AND;
3572                         StructCopy(&accum, data->start_class, regnode_ssc);
3573                         flags |= SCF_DO_STCLASS_OR;
3574                     }
3575                 }
3576
3577                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3578                 /* demq.
3579
3580                    Assuming this was/is a branch we are dealing with: 'scan'
3581                    now points at the item that follows the branch sequence,
3582                    whatever it is. We now start at the beginning of the
3583                    sequence and look for subsequences of
3584
3585                    BRANCH->EXACT=>x1
3586                    BRANCH->EXACT=>x2
3587                    tail
3588
3589                    which would be constructed from a pattern like
3590                    /A|LIST|OF|WORDS/
3591
3592                    If we can find such a subsequence we need to turn the first
3593                    element into a trie and then add the subsequent branch exact
3594                    strings to the trie.
3595
3596                    We have two cases
3597
3598                      1. patterns where the whole set of branches can be
3599                         converted.
3600
3601                      2. patterns where only a subset can be converted.
3602
3603                    In case 1 we can replace the whole set with a single regop
3604                    for the trie. In case 2 we need to keep the start and end
3605                    branches so
3606
3607                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3608                      becomes BRANCH TRIE; BRANCH X;
3609
3610                   There is an additional case, that being where there is a
3611                   common prefix, which gets split out into an EXACT like node
3612                   preceding the TRIE node.
3613
3614                   If x(1..n)==tail then we can do a simple trie, if not we make
3615                   a "jump" trie, such that when we match the appropriate word
3616                   we "jump" to the appropriate tail node. Essentially we turn
3617                   a nested if into a case structure of sorts.
3618
3619                 */
3620
3621                     int made=0;
3622                     if (!re_trie_maxbuff) {
3623                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3624                         if (!SvIOK(re_trie_maxbuff))
3625                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3626                     }
3627                     if ( SvIV(re_trie_maxbuff)>=0  ) {
3628                         regnode *cur;
3629                         regnode *first = (regnode *)NULL;
3630                         regnode *last = (regnode *)NULL;
3631                         regnode *tail = scan;
3632                         U8 trietype = 0;
3633                         U32 count=0;
3634
3635 #ifdef DEBUGGING
3636                         SV * const mysv = sv_newmortal();       /* for dumping */
3637 #endif
3638                         /* var tail is used because there may be a TAIL
3639                            regop in the way. Ie, the exacts will point to the
3640                            thing following the TAIL, but the last branch will
3641                            point at the TAIL. So we advance tail. If we
3642                            have nested (?:) we may have to move through several
3643                            tails.
3644                          */
3645
3646                         while ( OP( tail ) == TAIL ) {
3647                             /* this is the TAIL generated by (?:) */
3648                             tail = regnext( tail );
3649                         }
3650
3651
3652                         DEBUG_TRIE_COMPILE_r({
3653                             regprop(RExC_rx, mysv, tail );
3654                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3655                                 (int)depth * 2 + 2, "",
3656                                 "Looking for TRIE'able sequences. Tail node is: ",
3657                                 SvPV_nolen_const( mysv )
3658                             );
3659                         });
3660
3661                         /*
3662
3663                             Step through the branches
3664                                 cur represents each branch,
3665                                 noper is the first thing to be matched as part
3666                                       of that branch
3667                                 noper_next is the regnext() of that node.
3668
3669                             We normally handle a case like this
3670                             /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
3671                             support building with NOJUMPTRIE, which restricts
3672                             the trie logic to structures like /FOO|BAR/.
3673
3674                             If noper is a trieable nodetype then the branch is
3675                             a possible optimization target. If we are building
3676                             under NOJUMPTRIE then we require that noper_next is
3677                             the same as scan (our current position in the regex
3678                             program).
3679
3680                             Once we have two or more consecutive such branches
3681                             we can create a trie of the EXACT's contents and
3682                             stitch it in place into the program.
3683
3684                             If the sequence represents all of the branches in
3685                             the alternation we replace the entire thing with a
3686                             single TRIE node.
3687
3688                             Otherwise when it is a subsequence we need to
3689                             stitch it in place and replace only the relevant
3690                             branches. This means the first branch has to remain
3691                             as it is used by the alternation logic, and its
3692                             next pointer, and needs to be repointed at the item
3693                             on the branch chain following the last branch we
3694                             have optimized away.
3695
3696                             This could be either a BRANCH, in which case the
3697                             subsequence is internal, or it could be the item
3698                             following the branch sequence in which case the
3699                             subsequence is at the end (which does not
3700                             necessarily mean the first node is the start of the
3701                             alternation).
3702
3703                             TRIE_TYPE(X) is a define which maps the optype to a
3704                             trietype.
3705
3706                                 optype          |  trietype
3707                                 ----------------+-----------
3708                                 NOTHING         | NOTHING
3709                                 EXACT           | EXACT
3710                                 EXACTFU         | EXACTFU
3711                                 EXACTFU_SS      | EXACTFU
3712                                 EXACTFA         | EXACTFA
3713
3714
3715                         */
3716 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3717                        ( EXACT == (X) )   ? EXACT :        \
3718                        ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU :        \
3719                        ( EXACTFA == (X) ) ? EXACTFA :        \
3720                        0 )
3721
3722                         /* dont use tail as the end marker for this traverse */
3723                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3724                             regnode * const noper = NEXTOPER( cur );
3725                             U8 noper_type = OP( noper );
3726                             U8 noper_trietype = TRIE_TYPE( noper_type );
3727 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3728                             regnode * const noper_next = regnext( noper );
3729                             U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3730                             U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3731 #endif
3732
3733                             DEBUG_TRIE_COMPILE_r({
3734                                 regprop(RExC_rx, mysv, cur);
3735                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3736                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3737
3738                                 regprop(RExC_rx, mysv, noper);
3739                                 PerlIO_printf( Perl_debug_log, " -> %s",
3740                                     SvPV_nolen_const(mysv));
3741
3742                                 if ( noper_next ) {
3743                                   regprop(RExC_rx, mysv, noper_next );
3744                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3745                                     SvPV_nolen_const(mysv));
3746                                 }
3747                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3748                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3749                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3750                                 );
3751                             });
3752
3753                             /* Is noper a trieable nodetype that can be merged
3754                              * with the current trie (if there is one)? */
3755                             if ( noper_trietype
3756                                   &&
3757                                   (
3758                                         ( noper_trietype == NOTHING)
3759                                         || ( trietype == NOTHING )
3760                                         || ( trietype == noper_trietype )
3761                                   )
3762 #ifdef NOJUMPTRIE
3763                                   && noper_next == tail
3764 #endif
3765                                   && count < U16_MAX)
3766                             {
3767                                 /* Handle mergable triable node Either we are
3768                                  * the first node in a new trieable sequence,
3769                                  * in which case we do some bookkeeping,
3770                                  * otherwise we update the end pointer. */
3771                                 if ( !first ) {
3772                                     first = cur;
3773                                     if ( noper_trietype == NOTHING ) {
3774 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3775                                         regnode * const noper_next = regnext( noper );
3776                                         U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3777                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3778 #endif
3779
3780                                         if ( noper_next_trietype ) {
3781                                             trietype = noper_next_trietype;
3782                                         } else if (noper_next_type)  {
3783                                             /* a NOTHING regop is 1 regop wide.
3784                                              * We need at least two for a trie
3785                                              * so we can't merge this in */
3786                                             first = NULL;
3787                                         }
3788                                     } else {
3789                                         trietype = noper_trietype;
3790                                     }
3791                                 } else {
3792                                     if ( trietype == NOTHING )
3793                                         trietype = noper_trietype;
3794                                     last = cur;
3795                                 }
3796                                 if (first)
3797                                     count++;
3798                             } /* end handle mergable triable node */
3799                             else {
3800                                 /* handle unmergable node -
3801                                  * noper may either be a triable node which can
3802                                  * not be tried together with the current trie,
3803                                  * or a non triable node */
3804                                 if ( last ) {
3805                                     /* If last is set and trietype is not
3806                                      * NOTHING then we have found at least two
3807                                      * triable branch sequences in a row of a
3808                                      * similar trietype so we can turn them
3809                                      * into a trie. If/when we allow NOTHING to
3810                                      * start a trie sequence this condition
3811                                      * will be required, and it isn't expensive
3812                                      * so we leave it in for now. */
3813                                     if ( trietype && trietype != NOTHING )
3814                                         make_trie( pRExC_state,
3815                                                 startbranch, first, cur, tail, count,
3816                                                 trietype, depth+1 );
3817                                     last = NULL; /* note: we clear/update
3818                                                     first, trietype etc below,
3819                                                     so we dont do it here */
3820                                 }
3821                                 if ( noper_trietype
3822 #ifdef NOJUMPTRIE
3823                                      && noper_next == tail
3824 #endif
3825                                 ){
3826                                     /* noper is triable, so we can start a new
3827                                      * trie sequence */
3828                                     count = 1;
3829                                     first = cur;
3830                                     trietype = noper_trietype;
3831                                 } else if (first) {
3832                                     /* if we already saw a first but the
3833                                      * current node is not triable then we have
3834                                      * to reset the first information. */
3835                                     count = 0;
3836                                     first = NULL;
3837                                     trietype = 0;
3838                                 }
3839                             } /* end handle unmergable node */
3840                         } /* loop over branches */
3841                         DEBUG_TRIE_COMPILE_r({
3842                             regprop(RExC_rx, mysv, cur);
3843                             PerlIO_printf( Perl_debug_log,
3844                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3845                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3846
3847                         });
3848                         if ( last && trietype ) {
3849                             if ( trietype != NOTHING ) {
3850                                 /* the last branch of the sequence was part of
3851                                  * a trie, so we have to construct it here
3852                                  * outside of the loop */
3853                                 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3854 #ifdef TRIE_STUDY_OPT
3855                                 if ( ((made == MADE_EXACT_TRIE &&
3856                                      startbranch == first)
3857                                      || ( first_non_open == first )) &&
3858                                      depth==0 ) {
3859                                     flags |= SCF_TRIE_RESTUDY;
3860                                     if ( startbranch == first
3861                                          && scan == tail )
3862                                     {
3863                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3864                                     }
3865                                 }
3866 #endif
3867                             } else {
3868                                 /* at this point we know whatever we have is a
3869                                  * NOTHING sequence/branch AND if 'startbranch'
3870                                  * is 'first' then we can turn the whole thing
3871                                  * into a NOTHING
3872                                  */
3873                                 if ( startbranch == first ) {
3874                                     regnode *opt;
3875                                     /* the entire thing is a NOTHING sequence,
3876                                      * something like this: (?:|) So we can
3877                                      * turn it into a plain NOTHING op. */
3878                                     DEBUG_TRIE_COMPILE_r({
3879                                         regprop(RExC_rx, mysv, cur);
3880                                         PerlIO_printf( Perl_debug_log,
3881                                           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3882                                           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3883
3884                                     });
3885                                     OP(startbranch)= NOTHING;
3886                                     NEXT_OFF(startbranch)= tail - startbranch;
3887                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
3888                                         OP(opt)= OPTIMIZED;
3889                                 }
3890                             }
3891                         } /* end if ( last) */
3892                     } /* TRIE_MAXBUF is non zero */
3893
3894                 } /* do trie */
3895
3896             }
3897             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
3898                 scan = NEXTOPER(NEXTOPER(scan));
3899             } else                      /* single branch is optimized. */
3900                 scan = NEXTOPER(scan);
3901             continue;
3902         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3903             scan_frame *newframe = NULL;
3904             I32 paren;
3905             regnode *start;
3906             regnode *end;
3907             U32 my_recursed_depth= recursed_depth;
3908
3909             if (OP(scan) != SUSPEND) {
3910                 /* set the pointer */
3911                 if (OP(scan) == GOSUB) {
3912                     paren = ARG(scan);
3913                     RExC_recurse[ARG2L(scan)] = scan;
3914                     start = RExC_open_parens[paren-1];
3915                     end   = RExC_close_parens[paren-1];
3916                 } else {
3917                     paren = 0;
3918                     start = RExC_rxi->program + 1;
3919                     end   = RExC_opend;
3920                 }
3921                 if (!recursed_depth
3922                     ||
3923                     !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
3924                 ) {
3925                     if (!recursed_depth) {
3926                         Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
3927                     } else {
3928                         Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
3929                              RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
3930                              RExC_study_chunk_recursed_bytes, U8);
3931                     }
3932                     /* we havent recursed into this paren yet, so recurse into it */
3933                     DEBUG_STUDYDATA("set:", data,depth);
3934                     PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
3935                     my_recursed_depth= recursed_depth + 1;
3936                     Newx(newframe,1,scan_frame);
3937                 } else {
3938                     DEBUG_STUDYDATA("inf:", data,depth);
3939                     /* some form of infinite recursion, assume infinite length */
3940                     if (flags & SCF_DO_SUBSTR) {
3941                         SCAN_COMMIT(pRExC_state,data,minlenp);
3942                         data->longest = &(data->longest_float);
3943                     }
3944                     is_inf = is_inf_internal = 1;
3945                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3946                         ssc_anything(data->start_class);
3947                     flags &= ~SCF_DO_STCLASS;
3948                 }
3949             } else {
3950                 Newx(newframe,1,scan_frame);
3951                 paren = stopparen;
3952                 start = scan+2;
3953                 end = regnext(scan);
3954             }
3955             if (newframe) {
3956                 assert(start);
3957                 assert(end);
3958                 SAVEFREEPV(newframe);
3959                 newframe->next = regnext(scan);
3960                 newframe->last = last;
3961                 newframe->stop = stopparen;
3962                 newframe->prev = frame;
3963                 newframe->prev_recursed_depth = recursed_depth;
3964
3965                 DEBUG_STUDYDATA("frame-new:",data,depth);
3966                 DEBUG_PEEP("fnew", scan, depth);
3967
3968                 frame = newframe;
3969                 scan =  start;
3970                 stopparen = paren;
3971                 last = end;
3972                 depth = depth + 1;
3973                 recursed_depth= my_recursed_depth;
3974
3975                 continue;
3976             }
3977         }
3978         else if (OP(scan) == EXACT) {
3979             SSize_t l = STR_LEN(scan);
3980             UV uc;
3981             if (UTF) {
3982                 const U8 * const s = (U8*)STRING(scan);
3983                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3984                 l = utf8_length(s, s + l);
3985             } else {
3986                 uc = *((U8*)STRING(scan));
3987             }
3988             min += l;
3989             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3990                 /* The code below prefers earlier match for fixed
3991                    offset, later match for variable offset.  */
3992                 if (data->last_end == -1) { /* Update the start info. */
3993                     data->last_start_min = data->pos_min;
3994                     data->last_start_max = is_inf
3995                         ? SSize_t_MAX : data->pos_min + data->pos_delta;
3996                 }
3997                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3998                 if (UTF)
3999                     SvUTF8_on(data->last_found);
4000                 {
4001                     SV * const sv = data->last_found;
4002                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4003                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4004                     if (mg && mg->mg_len >= 0)
4005                         mg->mg_len += utf8_length((U8*)STRING(scan),
4006                                                   (U8*)STRING(scan)+STR_LEN(scan));
4007                 }
4008                 data->last_end = data->pos_min + l;
4009                 data->pos_min += l; /* As in the first entry. */
4010                 data->flags &= ~SF_BEFORE_EOL;
4011             }
4012
4013             /* ANDing the code point leaves at most it, and not in locale, and
4014              * can't match null string */
4015             if (flags & SCF_DO_STCLASS_AND) {
4016                 ssc_cp_and(data->start_class, uc);
4017                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4018                 ssc_clear_locale(data->start_class);
4019             }
4020             else if (flags & SCF_DO_STCLASS_OR) {
4021                 ssc_add_cp(data->start_class, uc);
4022                 ssc_and(pRExC_state, data->start_class, and_withp);
4023             }
4024             flags &= ~SCF_DO_STCLASS;
4025         }
4026         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
4027             SSize_t l = STR_LEN(scan);
4028             UV uc = *((U8*)STRING(scan));
4029             SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
4030                                                      separate code points */
4031
4032             /* Search for fixed substrings supports EXACT only. */
4033             if (flags & SCF_DO_SUBSTR) {
4034                 assert(data);
4035                 SCAN_COMMIT(pRExC_state, data, minlenp);
4036             }
4037             if (UTF) {
4038                 const U8 * const s = (U8 *)STRING(scan);
4039                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4040                 l = utf8_length(s, s + l);
4041             }
4042             if (has_exactf_sharp_s) {
4043                 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
4044             }
4045             min += l - min_subtract;
4046             assert (min >= 0);
4047             delta += min_subtract;
4048             if (flags & SCF_DO_SUBSTR) {
4049                 data->pos_min += l - min_subtract;
4050                 if (data->pos_min < 0) {
4051                     data->pos_min = 0;
4052                 }
4053                 data->pos_delta += min_subtract;
4054                 if (min_subtract) {
4055                     data->longest = &(data->longest_float);
4056                 }
4057             }
4058             if (OP(scan) == EXACTFL) {
4059                 if (flags & SCF_DO_STCLASS_AND) {
4060                     ssc_flags_and(data->start_class,
4061                                                 ANYOF_LOCALE|ANYOF_LOC_FOLD);
4062                 }
4063                 else if (flags & SCF_DO_STCLASS_OR) {
4064                     ANYOF_FLAGS(data->start_class)
4065                                                 |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
4066                 }
4067
4068                 /* We don't know what the folds are; it could be anything. XXX
4069                  * Actually, we only support UTF-8 encoding for code points
4070                  * above Latin1, so we could know what those folds are. */
4071                 EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
4072                                                        0,
4073                                                        UV_MAX);
4074             }
4075             else {  /* Non-locale EXACTFish */
4076                 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4077                 if (flags & SCF_DO_STCLASS_AND) {
4078                     ssc_clear_locale(data->start_class);
4079                 }
4080                 if (uc < 256) { /* We know what the Latin1 folds are ... */
4081                     if (IS_IN_SOME_FOLD_L1(uc)) {   /* For instance, we
4082                                                        know if anything folds
4083                                                        with this */
4084                         EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
4085                                                            PL_fold_latin1[uc]);
4086                         if (OP(scan) != EXACTFA) { /* The folds below aren't
4087                                                       legal under /iaa */
4088                             if (isARG2_lower_or_UPPER_ARG1('s', uc)) {
4089                                 EXACTF_invlist
4090                                     = add_cp_to_invlist(EXACTF_invlist,
4091                                                 LATIN_SMALL_LETTER_SHARP_S);
4092                             }
4093                             else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
4094                                 EXACTF_invlist
4095                                     = add_cp_to_invlist(EXACTF_invlist, 's');
4096                                 EXACTF_invlist
4097                                     = add_cp_to_invlist(EXACTF_invlist, 'S');
4098                             }
4099                         }
4100
4101                         /* We also know if there are above-Latin1 code points
4102                          * that fold to this (none legal for ASCII and /iaa) */
4103                         if ((! isASCII(uc) || OP(scan) != EXACTFA)
4104                             && HAS_NONLATIN1_FOLD_CLOSURE(uc))
4105                         {
4106                             /* XXX We could know exactly what does fold to this
4107                              * if the reverse folds are loaded, as currently in
4108                              * S_regclass() */
4109                             _invlist_union(EXACTF_invlist,
4110                                            PL_AboveLatin1,
4111                                            &EXACTF_invlist);
4112                         }
4113                     }
4114                 }
4115                 else {  /* Non-locale, above Latin1.  XXX We don't currently
4116                            know what participates in folds with this, so have
4117                            to assume anything could */
4118
4119                     /* XXX We could know exactly what does fold to this if the
4120                      * reverse folds are loaded, as currently in S_regclass().
4121                      * But we do know that under /iaa nothing in the ASCII
4122                      * range can participate */
4123                     if (OP(scan) == EXACTFA) {
4124                         _invlist_union_complement_2nd(EXACTF_invlist,
4125                                                       PL_Posix_ptrs[_CC_ASCII],
4126                                                       &EXACTF_invlist);
4127                     }
4128                     else {
4129                         EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
4130                                                                0, UV_MAX);
4131                     }
4132                 }
4133             }
4134             if (flags & SCF_DO_STCLASS_AND) {
4135                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4136                 ANYOF_POSIXL_ZERO(data->start_class);
4137                 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4138             }
4139             else if (flags & SCF_DO_STCLASS_OR) {
4140                 ssc_union(data->start_class, EXACTF_invlist, FALSE);
4141                 ssc_and(pRExC_state, data->start_class, and_withp);
4142             }
4143             flags &= ~SCF_DO_STCLASS;
4144             SvREFCNT_dec(EXACTF_invlist);
4145         }
4146         else if (REGNODE_VARIES(OP(scan))) {
4147             SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4148             I32 fl = 0, f = flags;
4149             regnode * const oscan = scan;
4150             regnode_ssc this_class;
4151             regnode_ssc *oclass = NULL;
4152             I32 next_is_eval = 0;
4153
4154             switch (PL_regkind[OP(scan)]) {
4155             case WHILEM:                /* End of (?:...)* . */
4156                 scan = NEXTOPER(scan);
4157                 goto finish;
4158             case PLUS:
4159                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4160                     next = NEXTOPER(scan);
4161                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
4162                         mincount = 1;
4163                         maxcount = REG_INFTY;
4164                         next = regnext(scan);
4165                         scan = NEXTOPER(scan);
4166                         goto do_curly;
4167                     }
4168                 }
4169                 if (flags & SCF_DO_SUBSTR)
4170                     data->pos_min++;
4171                 min++;
4172                 /* Fall through. */
4173             case STAR:
4174                 if (flags & SCF_DO_STCLASS) {
4175                     mincount = 0;
4176                     maxcount = REG_INFTY;
4177                     next = regnext(scan);
4178                     scan = NEXTOPER(scan);
4179                     goto do_curly;
4180                 }
4181                 is_inf = is_inf_internal = 1;
4182                 scan = regnext(scan);
4183                 if (flags & SCF_DO_SUBSTR) {
4184                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
4185                     data->longest = &(data->longest_float);
4186                 }
4187                 goto optimize_curly_tail;
4188             case CURLY:
4189                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4190                     && (scan->flags == stopparen))
4191                 {
4192                     mincount = 1;
4193                     maxcount = 1;
4194                 } else {
4195                     mincount = ARG1(scan);
4196                     maxcount = ARG2(scan);
4197                 }
4198                 next = regnext(scan);
4199                 if (OP(scan) == CURLYX) {
4200                     I32 lp = (data ? *(data->last_closep) : 0);
4201                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4202                 }
4203                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4204                 next_is_eval = (OP(scan) == EVAL);
4205               do_curly:
4206                 if (flags & SCF_DO_SUBSTR) {
4207                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
4208                     pos_before = data->pos_min;
4209                 }
4210                 if (data) {
4211                     fl = data->flags;
4212                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4213                     if (is_inf)
4214                         data->flags |= SF_IS_INF;
4215                 }
4216                 if (flags & SCF_DO_STCLASS) {
4217                     ssc_init(pRExC_state, &this_class);
4218                     oclass = data->start_class;
4219                     data->start_class = &this_class;
4220                     f |= SCF_DO_STCLASS_AND;
4221                     f &= ~SCF_DO_STCLASS_OR;
4222                 }
4223                 /* Exclude from super-linear cache processing any {n,m}
4224                    regops for which the combination of input pos and regex
4225                    pos is not enough information to determine if a match
4226                    will be possible.
4227
4228                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
4229                    regex pos at the \s*, the prospects for a match depend not
4230                    only on the input position but also on how many (bar\s*)
4231                    repeats into the {4,8} we are. */
4232                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4233                     f &= ~SCF_WHILEM_VISITED_POS;
4234
4235                 /* This will finish on WHILEM, setting scan, or on NULL: */
4236                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4237                                       last, data, stopparen, recursed_depth, NULL,
4238                                       (mincount == 0
4239                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
4240
4241                 if (flags & SCF_DO_STCLASS)
4242                     data->start_class = oclass;
4243                 if (mincount == 0 || minnext == 0) {
4244                     if (flags & SCF_DO_STCLASS_OR) {
4245                         ssc_or(pRExC_state, data->start_class, &this_class);
4246                     }
4247                     else if (flags & SCF_DO_STCLASS_AND) {
4248                         /* Switch to OR mode: cache the old value of
4249                          * data->start_class */
4250                         INIT_AND_WITHP;
4251                         StructCopy(data->start_class, and_withp, regnode_ssc);
4252                         flags &= ~SCF_DO_STCLASS_AND;
4253                         StructCopy(&this_class, data->start_class, regnode_ssc);
4254                         flags |= SCF_DO_STCLASS_OR;
4255                         ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
4256                     }
4257                 } else {                /* Non-zero len */
4258                     if (flags & SCF_DO_STCLASS_OR) {
4259                         ssc_or(pRExC_state, data->start_class, &this_class);
4260                         ssc_and(pRExC_state, data->start_class, and_withp);
4261                     }
4262                     else if (flags & SCF_DO_STCLASS_AND)
4263                         ssc_and(pRExC_state, data->start_class, &this_class);
4264                     flags &= ~SCF_DO_STCLASS;
4265                 }
4266                 if (!scan)              /* It was not CURLYX, but CURLY. */
4267                     scan = next;
4268                 if (!(flags & SCF_TRIE_DOING_RESTUDY)
4269                     /* ? quantifier ok, except for (?{ ... }) */
4270                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
4271                     && (minnext == 0) && (deltanext == 0)
4272                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4273                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
4274                 {
4275                     /* Fatal warnings may leak the regexp without this: */
4276                     SAVEFREESV(RExC_rx_sv);
4277                     ckWARNreg(RExC_parse,
4278                               "Quantifier unexpected on zero-length expression");
4279                     (void)ReREFCNT_inc(RExC_rx_sv);
4280                 }
4281
4282                 min += minnext * mincount;
4283                 is_inf_internal |= deltanext == SSize_t_MAX
4284                                      || (maxcount == REG_INFTY && minnext + deltanext > 0);
4285                 is_inf |= is_inf_internal;
4286                 if (is_inf)
4287                     delta = SSize_t_MAX;
4288                 else
4289                     delta += (minnext + deltanext) * maxcount - minnext * mincount;
4290
4291                 /* Try powerful optimization CURLYX => CURLYN. */
4292                 if (  OP(oscan) == CURLYX && data
4293                       && data->flags & SF_IN_PAR
4294                       && !(data->flags & SF_HAS_EVAL)
4295                       && !deltanext && minnext == 1 ) {
4296                     /* Try to optimize to CURLYN.  */
4297                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4298                     regnode * const nxt1 = nxt;
4299 #ifdef DEBUGGING
4300                     regnode *nxt2;
4301 #endif
4302
4303                     /* Skip open. */
4304                     nxt = regnext(nxt);
4305                     if (!REGNODE_SIMPLE(OP(nxt))
4306                         && !(PL_regkind[OP(nxt)] == EXACT
4307                              && STR_LEN(nxt) == 1))
4308                         goto nogo;
4309 #ifdef DEBUGGING
4310                     nxt2 = nxt;
4311 #endif
4312                     nxt = regnext(nxt);
4313                     if (OP(nxt) != CLOSE)
4314                         goto nogo;
4315                     if (RExC_open_parens) {
4316                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4317                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4318                     }
4319                     /* Now we know that nxt2 is the only contents: */
4320                     oscan->flags = (U8)ARG(nxt);
4321                     OP(oscan) = CURLYN;
4322                     OP(nxt1) = NOTHING; /* was OPEN. */
4323
4324 #ifdef DEBUGGING
4325                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4326                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4327                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4328                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
4329                     OP(nxt + 1) = OPTIMIZED; /* was count. */
4330                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4331 #endif
4332                 }
4333               nogo:
4334
4335                 /* Try optimization CURLYX => CURLYM. */
4336                 if (  OP(oscan) == CURLYX && data
4337                       && !(data->flags & SF_HAS_PAR)
4338                       && !(data->flags & SF_HAS_EVAL)
4339                       && !deltanext     /* atom is fixed width */
4340                       && minnext != 0   /* CURLYM can't handle zero width */
4341                       && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
4342                 ) {
4343                     /* XXXX How to optimize if data == 0? */
4344                     /* Optimize to a simpler form.  */
4345                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4346                     regnode *nxt2;
4347
4348                     OP(oscan) = CURLYM;
4349                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4350                             && (OP(nxt2) != WHILEM))
4351                         nxt = nxt2;
4352                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
4353                     /* Need to optimize away parenths. */
4354                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4355                         /* Set the parenth number.  */
4356                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4357
4358                         oscan->flags = (U8)ARG(nxt);
4359                         if (RExC_open_parens) {
4360                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4361                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4362                         }
4363                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
4364                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
4365
4366 #ifdef DEBUGGING
4367                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4368                         OP(nxt + 1) = OPTIMIZED; /* was count. */
4369                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4370                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4371 #endif
4372 #if 0
4373                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
4374                             regnode *nnxt = regnext(nxt1);
4375                             if (nnxt == nxt) {
4376                                 if (reg_off_by_arg[OP(nxt1)])
4377                                     ARG_SET(nxt1, nxt2 - nxt1);
4378                                 else if (nxt2 - nxt1 < U16_MAX)
4379                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
4380                                 else
4381                                     OP(nxt) = NOTHING;  /* Cannot beautify */
4382                             }
4383                             nxt1 = nnxt;
4384                         }
4385 #endif
4386                         /* Optimize again: */
4387                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4388                                     NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4389                     }
4390                     else
4391                         oscan->flags = 0;
4392                 }
4393                 else if ((OP(oscan) == CURLYX)
4394                          && (flags & SCF_WHILEM_VISITED_POS)
4395                          /* See the comment on a similar expression above.
4396                             However, this time it's not a subexpression
4397                             we care about, but the expression itself. */
4398                          && (maxcount == REG_INFTY)
4399                          && data && ++data->whilem_c < 16) {
4400                     /* This stays as CURLYX, we can put the count/of pair. */
4401                     /* Find WHILEM (as in regexec.c) */
4402                     regnode *nxt = oscan + NEXT_OFF(oscan);
4403
4404                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4405                         nxt += ARG(nxt);
4406                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
4407                         | (RExC_whilem_seen << 4)); /* On WHILEM */
4408                 }
4409                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4410                     pars++;
4411                 if (flags & SCF_DO_SUBSTR) {
4412                     SV *last_str = NULL;
4413                     int counted = mincount != 0;
4414
4415                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4416                         SSize_t b = pos_before >= data->last_start_min
4417                             ? pos_before : data->last_start_min;
4418                         STRLEN l;
4419                         const char * const s = SvPV_const(data->last_found, l);
4420                         SSize_t old = b - data->last_start_min;
4421
4422                         if (UTF)
4423                             old = utf8_hop((U8*)s, old) - (U8*)s;
4424                         l -= old;
4425                         /* Get the added string: */
4426                         last_str = newSVpvn_utf8(s  + old, l, UTF);
4427                         if (deltanext == 0 && pos_before == b) {
4428                             /* What was added is a constant string */
4429                             if (mincount > 1) {
4430                                 SvGROW(last_str, (mincount * l) + 1);
4431                                 repeatcpy(SvPVX(last_str) + l,
4432                                           SvPVX_const(last_str), l, mincount - 1);
4433                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4434                                 /* Add additional parts. */
4435                                 SvCUR_set(data->last_found,
4436                                           SvCUR(data->last_found) - l);
4437                                 sv_catsv(data->last_found, last_str);
4438                                 {
4439                                     SV * sv = data->last_found;
4440                                     MAGIC *mg =
4441                                         SvUTF8(sv) && SvMAGICAL(sv) ?
4442                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4443                                     if (mg && mg->mg_len >= 0)
4444                                         mg->mg_len += CHR_SVLEN(last_str) - l;
4445                                 }
4446                                 data->last_end += l * (mincount - 1);
4447                             }
4448                         } else {
4449                             /* start offset must point into the last copy */
4450                             data->last_start_min += minnext * (mincount - 1);
4451                             data->last_start_max += is_inf ? SSize_t_MAX
4452                                 : (maxcount - 1) * (minnext + data->pos_delta);
4453                         }
4454                     }
4455                     /* It is counted once already... */
4456                     data->pos_min += minnext * (mincount - counted);
4457 #if 0
4458 PerlIO_printf(Perl_debug_log, "counted=%"UVdf" deltanext=%"UVdf
4459                               " SSize_t_MAX=%"UVdf" minnext=%"UVdf
4460                               " maxcount=%"UVdf" mincount=%"UVdf"\n",
4461     (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4462     (UV)mincount);
4463 if (deltanext != SSize_t_MAX)
4464 PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
4465     (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4466           - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4467 #endif
4468                     if (deltanext == SSize_t_MAX ||
4469                         -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4470                         data->pos_delta = SSize_t_MAX;
4471                     else
4472                         data->pos_delta += - counted * deltanext +
4473                         (minnext + deltanext) * maxcount - minnext * mincount;
4474                     if (mincount != maxcount) {
4475                          /* Cannot extend fixed substrings found inside
4476                             the group.  */
4477                         SCAN_COMMIT(pRExC_state,data,minlenp);
4478                         if (mincount && last_str) {
4479                             SV * const sv = data->last_found;
4480                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4481                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4482
4483                             if (mg)
4484                                 mg->mg_len = -1;
4485                             sv_setsv(sv, last_str);
4486                             data->last_end = data->pos_min;
4487                             data->last_start_min =
4488                                 data->pos_min - CHR_SVLEN(last_str);
4489                             data->last_start_max = is_inf
4490                                 ? SSize_t_MAX
4491                                 : data->pos_min + data->pos_delta
4492                                 - CHR_SVLEN(last_str);
4493                         }
4494                         data->longest = &(data->longest_float);
4495                     }
4496                     SvREFCNT_dec(last_str);
4497                 }
4498                 if (data && (fl & SF_HAS_EVAL))
4499                     data->flags |= SF_HAS_EVAL;
4500               optimize_curly_tail:
4501                 if (OP(oscan) != CURLYX) {
4502                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4503                            && NEXT_OFF(next))
4504                         NEXT_OFF(oscan) += NEXT_OFF(next);
4505                 }
4506                 continue;
4507
4508             default:
4509 #ifdef DEBUGGING
4510                 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
4511                                                                     OP(scan));
4512 #endif
4513             case REF:
4514             case CLUMP:
4515                 if (flags & SCF_DO_SUBSTR) {
4516                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
4517                     data->longest = &(data->longest_float);
4518                 }
4519                 is_inf = is_inf_internal = 1;
4520                 if (flags & SCF_DO_STCLASS_OR) {
4521                     if (OP(scan) == CLUMP) {
4522                         /* Actually is any start char, but very few code points
4523                          * aren't start characters */
4524                         ssc_match_all_cp(data->start_class);
4525                     }
4526                     else {
4527                         ssc_anything(data->start_class);
4528                     }
4529                 }
4530                 flags &= ~SCF_DO_STCLASS;
4531                 break;
4532             }
4533         }
4534         else if (OP(scan) == LNBREAK) {
4535             if (flags & SCF_DO_STCLASS) {
4536                 if (flags & SCF_DO_STCLASS_AND) {
4537                     ssc_intersection(data->start_class,
4538                                     PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
4539                     ssc_clear_locale(data->start_class);
4540                     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4541                 }
4542                 else if (flags & SCF_DO_STCLASS_OR) {
4543                     ssc_union(data->start_class,
4544                               PL_XPosix_ptrs[_CC_VERTSPACE],
4545                               FALSE);
4546                     ssc_and(pRExC_state, data->start_class, and_withp);
4547                 }
4548                 flags &= ~SCF_DO_STCLASS;
4549             }
4550             min++;
4551             delta++;    /* Because of the 2 char string cr-lf */
4552             if (flags & SCF_DO_SUBSTR) {
4553                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4554                 data->pos_min += 1;
4555                 data->pos_delta += 1;
4556                 data->longest = &(data->longest_float);
4557             }
4558         }
4559         else if (REGNODE_SIMPLE(OP(scan))) {
4560
4561             if (flags & SCF_DO_SUBSTR) {
4562                 SCAN_COMMIT(pRExC_state,data,minlenp);
4563                 data->pos_min++;
4564             }
4565             min++;
4566             if (flags & SCF_DO_STCLASS) {
4567                 bool invert = 0;
4568                 SV* my_invlist = sv_2mortal(_new_invlist(0));
4569                 U8 classnum;
4570                 U8 namedclass;
4571
4572                 if (flags & SCF_DO_STCLASS_AND) {
4573                     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4574                 }
4575
4576                 /* Some of the logic below assumes that switching
4577                    locale on will only add false positives. */
4578                 switch (OP(scan)) {
4579
4580                 default:
4581 #ifdef DEBUGGING
4582                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan));
4583 #endif
4584                 case CANY:
4585                 case SANY:
4586                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4587                         ssc_match_all_cp(data->start_class);
4588                     break;
4589
4590                 case REG_ANY:
4591                     {
4592                         SV* REG_ANY_invlist = _new_invlist(2);
4593                         REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
4594                                                             '\n');
4595                         if (flags & SCF_DO_STCLASS_OR) {
4596                             ssc_union(data->start_class,
4597                                       REG_ANY_invlist,
4598                                       TRUE /* TRUE => invert, hence all but \n
4599                                             */
4600                                       );
4601                         }
4602                         else if (flags & SCF_DO_STCLASS_AND) {
4603                             ssc_intersection(data->start_class,
4604                                              REG_ANY_invlist,
4605                                              TRUE  /* TRUE => invert */
4606                                              );
4607                             ssc_clear_locale(data->start_class);
4608                         }
4609                         SvREFCNT_dec_NN(REG_ANY_invlist);
4610                     }
4611                     break;
4612
4613                 case ANYOF_WARN_SUPER:
4614                 case ANYOF:
4615                     if (flags & SCF_DO_STCLASS_AND)
4616                         ssc_and(pRExC_state, data->start_class,
4617                                 (regnode_ssc*) scan);
4618                     else
4619                         ssc_or(pRExC_state, data->start_class,
4620                                                           (regnode_ssc*)scan);
4621                     break;
4622
4623                 case NPOSIXL:
4624                     invert = 1;
4625                     /* FALL THROUGH */
4626
4627                 case POSIXL:
4628                     classnum = FLAGS(scan);
4629                     namedclass = classnum_to_namedclass(classnum) + invert;
4630                     if (flags & SCF_DO_STCLASS_AND) {
4631                         bool was_there = cBOOL(
4632                                           ANYOF_POSIXL_TEST(data->start_class,
4633                                                                  namedclass));
4634                         ANYOF_POSIXL_ZERO(data->start_class);
4635                         if (was_there) {    /* Do an AND */
4636                             ANYOF_POSIXL_SET(data->start_class, namedclass);
4637                         }
4638                         /* No individual code points can now match */
4639                         data->start_class->invlist
4640                                                 = sv_2mortal(_new_invlist(0));
4641                     }
4642                     else {
4643                         int complement = namedclass + ((invert) ? -1 : 1);
4644
4645                         assert(flags & SCF_DO_STCLASS_OR);
4646
4647                         /* If the complement of this class was already there,
4648                          * the result is that they match all code points,
4649                          * (\d + \D == everything).  Remove the classes from
4650                          * future consideration.  Locale is not relevant in
4651                          * this case */
4652                         if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
4653                             ssc_match_all_cp(data->start_class);
4654                             ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
4655                             ANYOF_POSIXL_CLEAR(data->start_class, complement);
4656                             if (! ANYOF_POSIXL_TEST_ANY_SET(data->start_class))
4657                             {
4658                                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_POSIXL;
4659                             }
4660                         }
4661                         else {  /* The usual case; just add this class to the
4662                                    existing set */
4663                             ANYOF_POSIXL_SET(data->start_class, namedclass);
4664                             ANYOF_FLAGS(data->start_class)
4665                                                 |= ANYOF_LOCALE|ANYOF_POSIXL;
4666                         }
4667                     }
4668                     break;
4669
4670                 case NPOSIXA:   /* For these, we always know the exact set of
4671                                    what's matched */
4672                     invert = 1;
4673                     /* FALL THROUGH */
4674                 case POSIXA:
4675                     classnum = FLAGS(scan);
4676                     my_invlist = PL_Posix_ptrs[classnum];
4677                     goto join_posix;
4678
4679                 case NPOSIXD:
4680                 case NPOSIXU:
4681                     invert = 1;
4682                     /* FALL THROUGH */
4683                 case POSIXD:
4684                 case POSIXU:
4685                     classnum = FLAGS(scan);
4686
4687                     /* If we know all the code points that match the class, use
4688                      * that; otherwise use the Latin1 code points, plus we have
4689                      * to assume that it could match anything above Latin1 */
4690                     if (PL_XPosix_ptrs[classnum]) {
4691                         my_invlist = invlist_clone(PL_XPosix_ptrs[classnum]);
4692                     }
4693                     else {
4694                         _invlist_union(PL_L1Posix_ptrs[classnum],
4695                                        PL_AboveLatin1, &my_invlist);
4696                     }
4697
4698                     /* NPOSIXD matches all upper Latin1 code points unless the
4699                      * target string being matched is UTF-8, which is
4700                      * unknowable until match time */
4701                     if (PL_regkind[OP(scan)] == NPOSIXD) {
4702                         _invlist_union_complement_2nd(my_invlist,
4703                                         PL_Posix_ptrs[_CC_ASCII], &my_invlist);
4704                     }
4705
4706                   join_posix:
4707
4708                     if (flags & SCF_DO_STCLASS_AND) {
4709                         ssc_intersection(data->start_class, my_invlist, invert);
4710                         ssc_clear_locale(data->start_class);
4711                     }
4712                     else {
4713                         assert(flags & SCF_DO_STCLASS_OR);
4714                         ssc_union(data->start_class, my_invlist, invert);
4715                     }
4716                 }
4717                 if (flags & SCF_DO_STCLASS_OR)
4718                     ssc_and(pRExC_state, data->start_class, and_withp);
4719                 flags &= ~SCF_DO_STCLASS;
4720             }
4721         }
4722         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4723             data->flags |= (OP(scan) == MEOL
4724                             ? SF_BEFORE_MEOL
4725                             : SF_BEFORE_SEOL);
4726             SCAN_COMMIT(pRExC_state, data, minlenp);
4727
4728         }
4729         else if (  PL_regkind[OP(scan)] == BRANCHJ
4730                  /* Lookbehind, or need to calculate parens/evals/stclass: */
4731                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
4732                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4733             if ( OP(scan) == UNLESSM &&
4734                  scan->flags == 0 &&
4735                  OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4736                  OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4737             ) {
4738                 regnode *opt;
4739                 regnode *upto= regnext(scan);
4740                 DEBUG_PARSE_r({
4741                     SV * const mysv_val=sv_newmortal();
4742                     DEBUG_STUDYDATA("OPFAIL",data,depth);
4743
4744                     /*DEBUG_PARSE_MSG("opfail");*/
4745                     regprop(RExC_rx, mysv_val, upto);
4746                     PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4747                                   SvPV_nolen_const(mysv_val),
4748                                   (IV)REG_NODE_NUM(upto),
4749                                   (IV)(upto - scan)
4750                     );
4751                 });
4752                 OP(scan) = OPFAIL;
4753                 NEXT_OFF(scan) = upto - scan;
4754                 for (opt= scan + 1; opt < upto ; opt++)
4755                     OP(opt) = OPTIMIZED;
4756                 scan= upto;
4757                 continue;
4758             }
4759             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4760                 || OP(scan) == UNLESSM )
4761             {
4762                 /* Negative Lookahead/lookbehind
4763                    In this case we can't do fixed string optimisation.
4764                 */
4765
4766                 SSize_t deltanext, minnext, fake = 0;
4767                 regnode *nscan;
4768                 regnode_ssc intrnl;
4769                 int f = 0;
4770
4771                 data_fake.flags = 0;
4772                 if (data) {
4773                     data_fake.whilem_c = data->whilem_c;
4774                     data_fake.last_closep = data->last_closep;
4775                 }
4776                 else
4777                     data_fake.last_closep = &fake;
4778                 data_fake.pos_delta = delta;
4779                 if ( flags & SCF_DO_STCLASS && !scan->flags
4780                      && OP(scan) == IFMATCH ) { /* Lookahead */
4781                     ssc_init(pRExC_state, &intrnl);
4782                     data_fake.start_class = &intrnl;
4783                     f |= SCF_DO_STCLASS_AND;
4784                 }
4785                 if (flags & SCF_WHILEM_VISITED_POS)
4786                     f |= SCF_WHILEM_VISITED_POS;
4787                 next = regnext(scan);
4788                 nscan = NEXTOPER(NEXTOPER(scan));
4789                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4790                     last, &data_fake, stopparen, recursed_depth, NULL, f, depth+1);
4791                 if (scan->flags) {
4792                     if (deltanext) {
4793                         FAIL("Variable length lookbehind not implemented");
4794                     }
4795                     else if (minnext > (I32)U8_MAX) {
4796                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4797                     }
4798                     scan->flags = (U8)minnext;
4799                 }
4800                 if (data) {
4801                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4802                         pars++;
4803                     if (data_fake.flags & SF_HAS_EVAL)
4804                         data->flags |= SF_HAS_EVAL;
4805                     data->whilem_c = data_fake.whilem_c;
4806                 }
4807                 if (f & SCF_DO_STCLASS_AND) {
4808                     if (flags & SCF_DO_STCLASS_OR) {
4809                         /* OR before, AND after: ideally we would recurse with
4810                          * data_fake to get the AND applied by study of the
4811                          * remainder of the pattern, and then derecurse;
4812                          * *** HACK *** for now just treat as "no information".
4813                          * See [perl #56690].
4814                          */
4815                         ssc_init(pRExC_state, data->start_class);
4816                     }  else {
4817                         /* AND before and after: combine and continue */
4818                         ssc_and(pRExC_state, data->start_class, &intrnl);
4819                     }
4820                 }
4821             }
4822 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4823             else {
4824                 /* Positive Lookahead/lookbehind
4825                    In this case we can do fixed string optimisation,
4826                    but we must be careful about it. Note in the case of
4827                    lookbehind the positions will be offset by the minimum
4828                    length of the pattern, something we won't know about
4829                    until after the recurse.
4830                 */
4831                 SSize_t deltanext, fake = 0;
4832                 regnode *nscan;
4833                 regnode_ssc intrnl;
4834                 int f = 0;
4835                 /* We use SAVEFREEPV so that when the full compile
4836                     is finished perl will clean up the allocated
4837                     minlens when it's all done. This way we don't
4838                     have to worry about freeing them when we know
4839                     they wont be used, which would be a pain.
4840                  */
4841                 SSize_t *minnextp;
4842                 Newx( minnextp, 1, SSize_t );
4843                 SAVEFREEPV(minnextp);
4844
4845                 if (data) {
4846                     StructCopy(data, &data_fake, scan_data_t);
4847                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4848                         f |= SCF_DO_SUBSTR;
4849                         if (scan->flags)
4850                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4851                         data_fake.last_found=newSVsv(data->last_found);
4852                     }
4853                 }
4854                 else
4855                     data_fake.last_closep = &fake;
4856                 data_fake.flags = 0;
4857                 data_fake.pos_delta = delta;
4858                 if (is_inf)
4859                     data_fake.flags |= SF_IS_INF;
4860                 if ( flags & SCF_DO_STCLASS && !scan->flags
4861                      && OP(scan) == IFMATCH ) { /* Lookahead */
4862                     ssc_init(pRExC_state, &intrnl);
4863                     data_fake.start_class = &intrnl;
4864                     f |= SCF_DO_STCLASS_AND;
4865                 }
4866                 if (flags & SCF_WHILEM_VISITED_POS)
4867                     f |= SCF_WHILEM_VISITED_POS;
4868                 next = regnext(scan);
4869                 nscan = NEXTOPER(NEXTOPER(scan));
4870
4871                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4872                     last, &data_fake, stopparen, recursed_depth, NULL, f,depth+1);
4873                 if (scan->flags) {
4874                     if (deltanext) {
4875                         FAIL("Variable length lookbehind not implemented");
4876                     }
4877                     else if (*minnextp > (I32)U8_MAX) {
4878                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4879                     }
4880                     scan->flags = (U8)*minnextp;
4881                 }
4882
4883                 *minnextp += min;
4884
4885                 if (f & SCF_DO_STCLASS_AND) {
4886                     ssc_and(pRExC_state, data->start_class, &intrnl);
4887                 }
4888                 if (data) {
4889                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4890                         pars++;
4891                     if (data_fake.flags & SF_HAS_EVAL)
4892                         data->flags |= SF_HAS_EVAL;
4893                     data->whilem_c = data_fake.whilem_c;
4894                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4895                         if (RExC_rx->minlen<*minnextp)
4896                             RExC_rx->minlen=*minnextp;
4897                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4898                         SvREFCNT_dec_NN(data_fake.last_found);
4899
4900                         if ( data_fake.minlen_fixed != minlenp )
4901                         {
4902                             data->offset_fixed= data_fake.offset_fixed;
4903                             data->minlen_fixed= data_fake.minlen_fixed;
4904                             data->lookbehind_fixed+= scan->flags;
4905                         }
4906                         if ( data_fake.minlen_float != minlenp )
4907                         {
4908                             data->minlen_float= data_fake.minlen_float;
4909                             data->offset_float_min=data_fake.offset_float_min;
4910                             data->offset_float_max=data_fake.offset_float_max;
4911                             data->lookbehind_float+= scan->flags;
4912                         }
4913                     }
4914                 }
4915             }
4916 #endif
4917         }
4918         else if (OP(scan) == OPEN) {
4919             if (stopparen != (I32)ARG(scan))
4920                 pars++;
4921         }
4922         else if (OP(scan) == CLOSE) {
4923             if (stopparen == (I32)ARG(scan)) {
4924                 break;
4925             }
4926             if ((I32)ARG(scan) == is_par) {
4927                 next = regnext(scan);
4928
4929                 if ( next && (OP(next) != WHILEM) && next < last)
4930                     is_par = 0;         /* Disable optimization */
4931             }
4932             if (data)
4933                 *(data->last_closep) = ARG(scan);
4934         }
4935         else if (OP(scan) == EVAL) {
4936                 if (data)
4937                     data->flags |= SF_HAS_EVAL;
4938         }
4939         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4940             if (flags & SCF_DO_SUBSTR) {
4941                 SCAN_COMMIT(pRExC_state,data,minlenp);
4942                 flags &= ~SCF_DO_SUBSTR;
4943             }
4944             if (data && OP(scan)==ACCEPT) {
4945                 data->flags |= SCF_SEEN_ACCEPT;
4946                 if (stopmin > min)
4947                     stopmin = min;
4948             }
4949         }
4950         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4951         {
4952                 if (flags & SCF_DO_SUBSTR) {
4953                     SCAN_COMMIT(pRExC_state,data,minlenp);
4954                     data->longest = &(data->longest_float);
4955                 }
4956                 is_inf = is_inf_internal = 1;
4957                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4958                     ssc_anything(data->start_class);
4959                 flags &= ~SCF_DO_STCLASS;
4960         }
4961         else if (OP(scan) == GPOS) {
4962             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4963                 !(delta || is_inf || (data && data->pos_delta)))
4964             {
4965                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4966                     RExC_rx->extflags |= RXf_ANCH_GPOS;
4967                 if (RExC_rx->gofs < (STRLEN)min)
4968                     RExC_rx->gofs = min;
4969             } else {
4970                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4971                 RExC_rx->gofs = 0;
4972             }
4973         }
4974 #ifdef TRIE_STUDY_OPT
4975 #ifdef FULL_TRIE_STUDY
4976         else if (PL_regkind[OP(scan)] == TRIE) {
4977             /* NOTE - There is similar code to this block above for handling
4978                BRANCH nodes on the initial study.  If you change stuff here
4979                check there too. */
4980             regnode *trie_node= scan;
4981             regnode *tail= regnext(scan);
4982             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4983             SSize_t max1 = 0, min1 = SSize_t_MAX;
4984             regnode_ssc accum;
4985
4986             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4987                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4988             if (flags & SCF_DO_STCLASS)
4989                 ssc_init_zero(pRExC_state, &accum);
4990
4991             if (!trie->jump) {
4992                 min1= trie->minlen;
4993                 max1= trie->maxlen;
4994             } else {
4995                 const regnode *nextbranch= NULL;
4996                 U32 word;
4997
4998                 for ( word=1 ; word <= trie->wordcount ; word++)
4999                 {
5000                     SSize_t deltanext=0, minnext=0, f = 0, fake;
5001                     regnode_ssc this_class;
5002
5003                     data_fake.flags = 0;
5004                     if (data) {
5005                         data_fake.whilem_c = data->whilem_c;
5006                         data_fake.last_closep = data->last_closep;
5007                     }
5008                     else
5009                         data_fake.last_closep = &fake;
5010                     data_fake.pos_delta = delta;
5011                     if (flags & SCF_DO_STCLASS) {
5012                         ssc_init(pRExC_state, &this_class);
5013                         data_fake.start_class = &this_class;
5014                         f = SCF_DO_STCLASS_AND;
5015                     }
5016                     if (flags & SCF_WHILEM_VISITED_POS)
5017                         f |= SCF_WHILEM_VISITED_POS;
5018
5019                     if (trie->jump[word]) {
5020                         if (!nextbranch)
5021                             nextbranch = trie_node + trie->jump[0];
5022                         scan= trie_node + trie->jump[word];
5023                         /* We go from the jump point to the branch that follows
5024                            it. Note this means we need the vestigal unused branches
5025                            even though they arent otherwise used.
5026                          */
5027                         minnext = study_chunk(pRExC_state, &scan, minlenp,
5028                             &deltanext, (regnode *)nextbranch, &data_fake,
5029                             stopparen, recursed_depth, NULL, f,depth+1);
5030                     }
5031                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5032                         nextbranch= regnext((regnode*)nextbranch);
5033
5034                     if (min1 > (SSize_t)(minnext + trie->minlen))
5035                         min1 = minnext + trie->minlen;
5036                     if (deltanext == SSize_t_MAX) {
5037                         is_inf = is_inf_internal = 1;
5038                         max1 = SSize_t_MAX;
5039                     } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5040                         max1 = minnext + deltanext + trie->maxlen;
5041
5042                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5043                         pars++;
5044                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
5045                         if ( stopmin > min + min1)
5046                             stopmin = min + min1;
5047                         flags &= ~SCF_DO_SUBSTR;
5048                         if (data)
5049                             data->flags |= SCF_SEEN_ACCEPT;
5050                     }
5051                     if (data) {
5052                         if (data_fake.flags & SF_HAS_EVAL)
5053                             data->flags |= SF_HAS_EVAL;
5054                         data->whilem_c = data_fake.whilem_c;
5055                     }
5056                     if (flags & SCF_DO_STCLASS)
5057                         ssc_or(pRExC_state, &accum, &this_class);
5058                 }
5059             }
5060             if (flags & SCF_DO_SUBSTR) {
5061                 data->pos_min += min1;
5062                 data->pos_delta += max1 - min1;
5063                 if (max1 != min1 || is_inf)
5064                     data->longest = &(data->longest_float);
5065             }
5066             min += min1;
5067             delta += max1 - min1;
5068             if (flags & SCF_DO_STCLASS_OR) {
5069                 ssc_or(pRExC_state, data->start_class, &accum);
5070                 if (min1) {
5071                     ssc_and(pRExC_state, data->start_class, and_withp);
5072                     flags &= ~SCF_DO_STCLASS;
5073                 }
5074             }
5075             else if (flags & SCF_DO_STCLASS_AND) {
5076                 if (min1) {
5077                     ssc_and(pRExC_state, data->start_class, &accum);
5078                     flags &= ~SCF_DO_STCLASS;
5079                 }
5080                 else {
5081                     /* Switch to OR mode: cache the old value of
5082                      * data->start_class */
5083                     INIT_AND_WITHP;
5084                     StructCopy(data->start_class, and_withp, regnode_ssc);
5085                     flags &= ~SCF_DO_STCLASS_AND;
5086                     StructCopy(&accum, data->start_class, regnode_ssc);
5087                     flags |= SCF_DO_STCLASS_OR;
5088                 }
5089             }
5090             scan= tail;
5091             continue;
5092         }
5093 #else
5094         else if (PL_regkind[OP(scan)] == TRIE) {
5095             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5096             U8*bang=NULL;
5097
5098             min += trie->minlen;
5099             delta += (trie->maxlen - trie->minlen);
5100             flags &= ~SCF_DO_STCLASS; /* xxx */
5101             if (flags & SCF_DO_SUBSTR) {
5102                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
5103                 data->pos_min += trie->minlen;
5104                 data->pos_delta += (trie->maxlen - trie->minlen);
5105                 if (trie->maxlen != trie->minlen)
5106                     data->longest = &(data->longest_float);
5107             }
5108             if (trie->jump) /* no more substrings -- for now /grr*/
5109                 flags &= ~SCF_DO_SUBSTR;
5110         }
5111 #endif /* old or new */
5112 #endif /* TRIE_STUDY_OPT */
5113
5114         /* Else: zero-length, ignore. */
5115         scan = regnext(scan);
5116     }
5117     /* If we are exiting a recursion we can unset its recursed bit
5118      * and allow ourselves to enter it again - no danger of an
5119      * infinite loop there.
5120     if (stopparen > -1 && recursed) {
5121         DEBUG_STUDYDATA("unset:", data,depth);
5122         PAREN_UNSET( recursed, stopparen);
5123     }
5124     */
5125     if (frame) {
5126         DEBUG_STUDYDATA("frame-end:",data,depth);
5127         DEBUG_PEEP("fend", scan, depth);
5128         /* restore previous context */
5129         last = frame->last;
5130         scan = frame->next;
5131         stopparen = frame->stop;
5132         recursed_depth = frame->prev_recursed_depth;
5133         depth = depth - 1;
5134
5135         frame = frame->prev;
5136         goto fake_study_recurse;
5137     }
5138
5139   finish:
5140     assert(!frame);
5141     DEBUG_STUDYDATA("pre-fin:",data,depth);
5142
5143     *scanp = scan;
5144     *deltap = is_inf_internal ? SSize_t_MAX : delta;
5145     if (flags & SCF_DO_SUBSTR && is_inf)
5146         data->pos_delta = SSize_t_MAX - data->pos_min;
5147     if (is_par > (I32)U8_MAX)
5148         is_par = 0;
5149     if (is_par && pars==1 && data) {
5150         data->flags |= SF_IN_PAR;
5151         data->flags &= ~SF_HAS_PAR;
5152     }
5153     else if (pars && data) {
5154         data->flags |= SF_HAS_PAR;
5155         data->flags &= ~SF_IN_PAR;
5156     }
5157     if (flags & SCF_DO_STCLASS_OR)
5158         ssc_and(pRExC_state, data->start_class, and_withp);
5159     if (flags & SCF_TRIE_RESTUDY)
5160         data->flags |=  SCF_TRIE_RESTUDY;
5161
5162     DEBUG_STUDYDATA("post-fin:",data,depth);
5163
5164     return min < stopmin ? min : stopmin;
5165 }
5166
5167 STATIC U32
5168 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5169 {
5170     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5171
5172     PERL_ARGS_ASSERT_ADD_DATA;
5173
5174     Renewc(RExC_rxi->data,
5175            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5176            char, struct reg_data);
5177     if(count)
5178         Renew(RExC_rxi->data->what, count + n, U8);
5179     else
5180         Newx(RExC_rxi->data->what, n, U8);
5181     RExC_rxi->data->count = count + n;
5182     Copy(s, RExC_rxi->data->what + count, n, U8);
5183     return count;
5184 }
5185
5186 /*XXX: todo make this not included in a non debugging perl */
5187 #ifndef PERL_IN_XSUB_RE
5188 void
5189 Perl_reginitcolors(pTHX)
5190 {
5191     dVAR;
5192     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5193     if (s) {
5194         char *t = savepv(s);
5195         int i = 0;
5196         PL_colors[0] = t;
5197         while (++i < 6) {
5198             t = strchr(t, '\t');
5199             if (t) {
5200                 *t = '\0';
5201                 PL_colors[i] = ++t;
5202             }
5203             else
5204                 PL_colors[i] = t = (char *)"";
5205         }
5206     } else {
5207         int i = 0;
5208         while (i < 6)
5209             PL_colors[i++] = (char *)"";
5210     }
5211     PL_colorset = 1;
5212 }
5213 #endif
5214
5215
5216 #ifdef TRIE_STUDY_OPT
5217 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
5218     STMT_START {                                            \
5219         if (                                                \
5220               (data.flags & SCF_TRIE_RESTUDY)               \
5221               && ! restudied++                              \
5222         ) {                                                 \
5223             dOsomething;                                    \
5224             goto reStudy;                                   \
5225         }                                                   \
5226     } STMT_END
5227 #else
5228 #define CHECK_RESTUDY_GOTO_butfirst
5229 #endif
5230
5231 /*
5232  * pregcomp - compile a regular expression into internal code
5233  *
5234  * Decides which engine's compiler to call based on the hint currently in
5235  * scope
5236  */
5237
5238 #ifndef PERL_IN_XSUB_RE
5239
5240 /* return the currently in-scope regex engine (or the default if none)  */
5241
5242 regexp_engine const *
5243 Perl_current_re_engine(pTHX)
5244 {
5245     dVAR;
5246
5247     if (IN_PERL_COMPILETIME) {
5248         HV * const table = GvHV(PL_hintgv);
5249         SV **ptr;
5250
5251         if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5252             return &PL_core_reg_engine;
5253         ptr = hv_fetchs(table, "regcomp", FALSE);
5254         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5255             return &PL_core_reg_engine;
5256         return INT2PTR(regexp_engine*,SvIV(*ptr));
5257     }
5258     else {
5259         SV *ptr;
5260         if (!PL_curcop->cop_hints_hash)
5261             return &PL_core_reg_engine;
5262         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5263         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5264             return &PL_core_reg_engine;
5265         return INT2PTR(regexp_engine*,SvIV(ptr));
5266     }
5267 }
5268
5269
5270 REGEXP *
5271 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5272 {
5273     dVAR;
5274     regexp_engine const *eng = current_re_engine();
5275     GET_RE_DEBUG_FLAGS_DECL;
5276
5277     PERL_ARGS_ASSERT_PREGCOMP;
5278
5279     /* Dispatch a request to compile a regexp to correct regexp engine. */
5280     DEBUG_COMPILE_r({
5281         PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5282                         PTR2UV(eng));
5283     });
5284     return CALLREGCOMP_ENG(eng, pattern, flags);
5285 }
5286 #endif
5287
5288 /* public(ish) entry point for the perl core's own regex compiling code.
5289  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5290  * pattern rather than a list of OPs, and uses the internal engine rather
5291  * than the current one */
5292
5293 REGEXP *
5294 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5295 {
5296     SV *pat = pattern; /* defeat constness! */
5297     PERL_ARGS_ASSERT_RE_COMPILE;
5298     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5299 #ifdef PERL_IN_XSUB_RE
5300                                 &my_reg_engine,
5301 #else
5302                                 &PL_core_reg_engine,
5303 #endif
5304                                 NULL, NULL, rx_flags, 0);
5305 }
5306
5307
5308 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5309  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5310  * point to the realloced string and length.
5311  *
5312  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5313  * stuff added */
5314
5315 static void
5316 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5317                     char **pat_p, STRLEN *plen_p, int num_code_blocks)
5318 {
5319     U8 *const src = (U8*)*pat_p;
5320     U8 *dst;
5321     int n=0;
5322     STRLEN s = 0, d = 0;
5323     bool do_end = 0;
5324     GET_RE_DEBUG_FLAGS_DECL;
5325
5326     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5327         "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5328
5329     Newx(dst, *plen_p * 2 + 1, U8);
5330
5331     while (s < *plen_p) {
5332         if (NATIVE_BYTE_IS_INVARIANT(src[s]))
5333             dst[d]   = src[s];
5334         else {
5335             dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
5336             dst[d]   = UTF8_EIGHT_BIT_LO(src[s]);
5337         }
5338         if (n < num_code_blocks) {
5339             if (!do_end && pRExC_state->code_blocks[n].start == s) {
5340                 pRExC_state->code_blocks[n].start = d;
5341                 assert(dst[d] == '(');
5342                 do_end = 1;
5343             }
5344             else if (do_end && pRExC_state->code_blocks[n].end == s) {
5345                 pRExC_state->code_blocks[n].end = d;
5346                 assert(dst[d] == ')');
5347                 do_end = 0;
5348                 n++;
5349             }
5350         }
5351         s++;
5352         d++;
5353     }
5354     dst[d] = '\0';
5355     *plen_p = d;
5356     *pat_p = (char*) dst;
5357     SAVEFREEPV(*pat_p);
5358     RExC_orig_utf8 = RExC_utf8 = 1;
5359 }
5360
5361
5362
5363 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5364  * while recording any code block indices, and handling overloading,
5365  * nested qr// objects etc.  If pat is null, it will allocate a new
5366  * string, or just return the first arg, if there's only one.
5367  *
5368  * Returns the malloced/updated pat.
5369  * patternp and pat_count is the array of SVs to be concatted;
5370  * oplist is the optional list of ops that generated the SVs;
5371  * recompile_p is a pointer to a boolean that will be set if
5372  *   the regex will need to be recompiled.
5373  * delim, if non-null is an SV that will be inserted between each element
5374  */
5375
5376 static SV*
5377 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5378                 SV *pat, SV ** const patternp, int pat_count,
5379                 OP *oplist, bool *recompile_p, SV *delim)
5380 {
5381     SV **svp;
5382     int n = 0;
5383     bool use_delim = FALSE;
5384     bool alloced = FALSE;
5385
5386     /* if we know we have at least two args, create an empty string,
5387      * then concatenate args to that. For no args, return an empty string */
5388     if (!pat && pat_count != 1) {
5389         pat = newSVpvn("", 0);
5390         SAVEFREESV(pat);
5391         alloced = TRUE;
5392     }
5393
5394     for (svp = patternp; svp < patternp + pat_count; svp++) {
5395         SV *sv;
5396         SV *rx  = NULL;
5397         STRLEN orig_patlen = 0;
5398         bool code = 0;
5399         SV *msv = use_delim ? delim : *svp;
5400         if (!msv) msv = &PL_sv_undef;
5401
5402         /* if we've got a delimiter, we go round the loop twice for each
5403          * svp slot (except the last), using the delimiter the second
5404          * time round */
5405         if (use_delim) {
5406             svp--;
5407             use_delim = FALSE;
5408         }
5409         else if (delim)
5410             use_delim = TRUE;
5411
5412         if (SvTYPE(msv) == SVt_PVAV) {
5413             /* we've encountered an interpolated array within
5414              * the pattern, e.g. /...@a..../. Expand the list of elements,
5415              * then recursively append elements.
5416              * The code in this block is based on S_pushav() */
5417
5418             AV *const av = (AV*)msv;
5419             const SSize_t maxarg = AvFILL(av) + 1;
5420             SV **array;
5421
5422             if (oplist) {
5423                 assert(oplist->op_type == OP_PADAV
5424                     || oplist->op_type == OP_RV2AV);
5425                 oplist = oplist->op_sibling;;
5426             }
5427
5428             if (SvRMAGICAL(av)) {
5429                 SSize_t i;
5430
5431                 Newx(array, maxarg, SV*);
5432                 SAVEFREEPV(array);
5433                 for (i=0; i < maxarg; i++) {
5434                     SV ** const svp = av_fetch(av, i, FALSE);
5435                     array[i] = svp ? *svp : &PL_sv_undef;
5436                 }
5437             }
5438             else
5439                 array = AvARRAY(av);
5440
5441             pat = S_concat_pat(aTHX_ pRExC_state, pat,
5442                                 array, maxarg, NULL, recompile_p,
5443                                 /* $" */
5444                                 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5445
5446             continue;
5447         }
5448
5449
5450         /* we make the assumption here that each op in the list of
5451          * op_siblings maps to one SV pushed onto the stack,
5452          * except for code blocks, with have both an OP_NULL and
5453          * and OP_CONST.
5454          * This allows us to match up the list of SVs against the
5455          * list of OPs to find the next code block.
5456          *
5457          * Note that       PUSHMARK PADSV PADSV ..
5458          * is optimised to
5459          *                 PADRANGE PADSV  PADSV  ..
5460          * so the alignment still works. */
5461
5462         if (oplist) {
5463             if (oplist->op_type == OP_NULL
5464                 && (oplist->op_flags & OPf_SPECIAL))
5465             {
5466                 assert(n < pRExC_state->num_code_blocks);
5467                 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5468                 pRExC_state->code_blocks[n].block = oplist;
5469                 pRExC_state->code_blocks[n].src_regex = NULL;
5470                 n++;
5471                 code = 1;
5472                 oplist = oplist->op_sibling; /* skip CONST */
5473                 assert(oplist);
5474             }
5475             oplist = oplist->op_sibling;;
5476         }
5477
5478         /* apply magic and QR overloading to arg */
5479
5480         SvGETMAGIC(msv);
5481         if (SvROK(msv) && SvAMAGIC(msv)) {
5482             SV *sv = AMG_CALLunary(msv, regexp_amg);
5483             if (sv) {
5484                 if (SvROK(sv))
5485                     sv = SvRV(sv);
5486                 if (SvTYPE(sv) != SVt_REGEXP)
5487                     Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5488                 msv = sv;
5489             }
5490         }
5491
5492         /* try concatenation overload ... */
5493         if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5494                 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5495         {
5496             sv_setsv(pat, sv);
5497             /* overloading involved: all bets are off over literal
5498              * code. Pretend we haven't seen it */
5499             pRExC_state->num_code_blocks -= n;
5500             n = 0;
5501         }
5502         else  {
5503             /* ... or failing that, try "" overload */
5504             while (SvAMAGIC(msv)
5505                     && (sv = AMG_CALLunary(msv, string_amg))
5506                     && sv != msv
5507                     &&  !(   SvROK(msv)
5508                           && SvROK(sv)
5509                           && SvRV(msv) == SvRV(sv))
5510             ) {
5511                 msv = sv;
5512                 SvGETMAGIC(msv);
5513             }
5514             if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5515                 msv = SvRV(msv);
5516
5517             if (pat) {
5518                 /* this is a partially unrolled
5519                  *     sv_catsv_nomg(pat, msv);
5520                  * that allows us to adjust code block indices if
5521                  * needed */
5522                 STRLEN dlen;
5523                 char *dst = SvPV_force_nomg(pat, dlen);
5524                 orig_patlen = dlen;
5525                 if (SvUTF8(msv) && !SvUTF8(pat)) {
5526                     S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5527                     sv_setpvn(pat, dst, dlen);
5528                     SvUTF8_on(pat);
5529                 }
5530                 sv_catsv_nomg(pat, msv);
5531                 rx = msv;
5532             }
5533             else
5534                 pat = msv;
5535
5536             if (code)
5537                 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5538         }
5539
5540         /* extract any code blocks within any embedded qr//'s */
5541         if (rx && SvTYPE(rx) == SVt_REGEXP
5542             && RX_ENGINE((REGEXP*)rx)->op_comp)
5543         {
5544
5545             RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5546             if (ri->num_code_blocks) {
5547                 int i;
5548                 /* the presence of an embedded qr// with code means
5549                  * we should always recompile: the text of the
5550                  * qr// may not have changed, but it may be a
5551                  * different closure than last time */
5552                 *recompile_p = 1;
5553                 Renew(pRExC_state->code_blocks,
5554                     pRExC_state->num_code_blocks + ri->num_code_blocks,
5555                     struct reg_code_block);
5556                 pRExC_state->num_code_blocks += ri->num_code_blocks;
5557
5558                 for (i=0; i < ri->num_code_blocks; i++) {
5559                     struct reg_code_block *src, *dst;
5560                     STRLEN offset =  orig_patlen
5561                         + ReANY((REGEXP *)rx)->pre_prefix;
5562                     assert(n < pRExC_state->num_code_blocks);
5563                     src = &ri->code_blocks[i];
5564                     dst = &pRExC_state->code_blocks[n];
5565                     dst->start      = src->start + offset;
5566                     dst->end        = src->end   + offset;
5567                     dst->block      = src->block;
5568                     dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
5569                                             src->src_regex
5570                                                 ? src->src_regex
5571                                                 : (REGEXP*)rx);
5572                     n++;
5573                 }
5574             }
5575         }
5576     }
5577     /* avoid calling magic multiple times on a single element e.g. =~ $qr */
5578     if (alloced)
5579         SvSETMAGIC(pat);
5580
5581     return pat;
5582 }
5583
5584
5585
5586 /* see if there are any run-time code blocks in the pattern.
5587  * False positives are allowed */
5588
5589 static bool
5590 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5591                     char *pat, STRLEN plen)
5592 {
5593     int n = 0;
5594     STRLEN s;
5595
5596     for (s = 0; s < plen; s++) {
5597         if (n < pRExC_state->num_code_blocks
5598             && s == pRExC_state->code_blocks[n].start)
5599         {
5600             s = pRExC_state->code_blocks[n].end;
5601             n++;
5602             continue;
5603         }
5604         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5605          * positives here */
5606         if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
5607             (pat[s+2] == '{'
5608                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
5609         )
5610             return 1;
5611     }
5612     return 0;
5613 }
5614
5615 /* Handle run-time code blocks. We will already have compiled any direct
5616  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5617  * copy of it, but with any literal code blocks blanked out and
5618  * appropriate chars escaped; then feed it into
5619  *
5620  *    eval "qr'modified_pattern'"
5621  *
5622  * For example,
5623  *
5624  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5625  *
5626  * becomes
5627  *
5628  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5629  *
5630  * After eval_sv()-ing that, grab any new code blocks from the returned qr
5631  * and merge them with any code blocks of the original regexp.
5632  *
5633  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5634  * instead, just save the qr and return FALSE; this tells our caller that
5635  * the original pattern needs upgrading to utf8.
5636  */
5637
5638 static bool
5639 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5640     char *pat, STRLEN plen)
5641 {
5642     SV *qr;
5643
5644     GET_RE_DEBUG_FLAGS_DECL;
5645
5646     if (pRExC_state->runtime_code_qr) {
5647         /* this is the second time we've been called; this should
5648          * only happen if the main pattern got upgraded to utf8
5649          * during compilation; re-use the qr we compiled first time
5650          * round (which should be utf8 too)
5651          */
5652         qr = pRExC_state->runtime_code_qr;
5653         pRExC_state->runtime_code_qr = NULL;
5654         assert(RExC_utf8 && SvUTF8(qr));
5655     }
5656     else {
5657         int n = 0;
5658         STRLEN s;
5659         char *p, *newpat;
5660         int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5661         SV *sv, *qr_ref;
5662         dSP;
5663
5664         /* determine how many extra chars we need for ' and \ escaping */
5665         for (s = 0; s < plen; s++) {
5666             if (pat[s] == '\'' || pat[s] == '\\')
5667                 newlen++;
5668         }
5669
5670         Newx(newpat, newlen, char);
5671         p = newpat;
5672         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5673
5674         for (s = 0; s < plen; s++) {
5675             if (n < pRExC_state->num_code_blocks
5676                 && s == pRExC_state->code_blocks[n].start)
5677             {
5678                 /* blank out literal code block */
5679                 assert(pat[s] == '(');
5680                 while (s <= pRExC_state->code_blocks[n].end) {
5681                     *p++ = '_';
5682                     s++;
5683                 }
5684                 s--;
5685                 n++;
5686                 continue;
5687             }
5688             if (pat[s] == '\'' || pat[s] == '\\')
5689                 *p++ = '\\';
5690             *p++ = pat[s];
5691         }
5692         *p++ = '\'';
5693         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5694             *p++ = 'x';
5695         *p++ = '\0';
5696         DEBUG_COMPILE_r({
5697             PerlIO_printf(Perl_debug_log,
5698                 "%sre-parsing pattern for runtime code:%s %s\n",
5699                 PL_colors[4],PL_colors[5],newpat);
5700         });
5701
5702         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5703         Safefree(newpat);
5704
5705         ENTER;
5706         SAVETMPS;
5707         save_re_context();
5708         PUSHSTACKi(PERLSI_REQUIRE);
5709         /* G_RE_REPARSING causes the toker to collapse \\ into \ when
5710          * parsing qr''; normally only q'' does this. It also alters
5711          * hints handling */
5712         eval_sv(sv, G_SCALAR|G_RE_REPARSING);
5713         SvREFCNT_dec_NN(sv);
5714         SPAGAIN;
5715         qr_ref = POPs;
5716         PUTBACK;
5717         {
5718             SV * const errsv = ERRSV;
5719             if (SvTRUE_NN(errsv))
5720             {
5721                 Safefree(pRExC_state->code_blocks);
5722                 /* use croak_sv ? */
5723                 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
5724             }
5725         }
5726         assert(SvROK(qr_ref));
5727         qr = SvRV(qr_ref);
5728         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5729         /* the leaving below frees the tmp qr_ref.
5730          * Give qr a life of its own */
5731         SvREFCNT_inc(qr);
5732         POPSTACK;
5733         FREETMPS;
5734         LEAVE;
5735
5736     }
5737
5738     if (!RExC_utf8 && SvUTF8(qr)) {
5739         /* first time through; the pattern got upgraded; save the
5740          * qr for the next time through */
5741         assert(!pRExC_state->runtime_code_qr);
5742         pRExC_state->runtime_code_qr = qr;
5743         return 0;
5744     }
5745
5746
5747     /* extract any code blocks within the returned qr//  */
5748
5749
5750     /* merge the main (r1) and run-time (r2) code blocks into one */
5751     {
5752         RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
5753         struct reg_code_block *new_block, *dst;
5754         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5755         int i1 = 0, i2 = 0;
5756
5757         if (!r2->num_code_blocks) /* we guessed wrong */
5758         {
5759             SvREFCNT_dec_NN(qr);
5760             return 1;
5761         }
5762
5763         Newx(new_block,
5764             r1->num_code_blocks + r2->num_code_blocks,
5765             struct reg_code_block);
5766         dst = new_block;
5767
5768         while (    i1 < r1->num_code_blocks
5769                 || i2 < r2->num_code_blocks)
5770         {
5771             struct reg_code_block *src;
5772             bool is_qr = 0;
5773
5774             if (i1 == r1->num_code_blocks) {
5775                 src = &r2->code_blocks[i2++];
5776                 is_qr = 1;
5777             }
5778             else if (i2 == r2->num_code_blocks)
5779                 src = &r1->code_blocks[i1++];
5780             else if (  r1->code_blocks[i1].start
5781                      < r2->code_blocks[i2].start)
5782             {
5783                 src = &r1->code_blocks[i1++];
5784                 assert(src->end < r2->code_blocks[i2].start);
5785             }
5786             else {
5787                 assert(  r1->code_blocks[i1].start
5788                        > r2->code_blocks[i2].start);
5789                 src = &r2->code_blocks[i2++];
5790                 is_qr = 1;
5791                 assert(src->end < r1->code_blocks[i1].start);
5792             }
5793
5794             assert(pat[src->start] == '(');
5795             assert(pat[src->end]   == ')');
5796             dst->start      = src->start;
5797             dst->end        = src->end;
5798             dst->block      = src->block;
5799             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5800                                     : src->src_regex;
5801             dst++;
5802         }
5803         r1->num_code_blocks += r2->num_code_blocks;
5804         Safefree(r1->code_blocks);
5805         r1->code_blocks = new_block;
5806     }
5807
5808     SvREFCNT_dec_NN(qr);
5809     return 1;
5810 }
5811
5812
5813 STATIC bool
5814 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
5815                       SSize_t lookbehind, SSize_t offset, SSize_t *minlen, STRLEN longest_length, bool eol, bool meol)
5816 {
5817     /* This is the common code for setting up the floating and fixed length
5818      * string data extracted from Perl_re_op_compile() below.  Returns a boolean
5819      * as to whether succeeded or not */
5820
5821     I32 t;
5822     SSize_t ml;
5823
5824     if (! (longest_length
5825            || (eol /* Can't have SEOL and MULTI */
5826                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5827           )
5828             /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5829         || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5830     {
5831         return FALSE;
5832     }
5833
5834     /* copy the information about the longest from the reg_scan_data
5835         over to the program. */
5836     if (SvUTF8(sv_longest)) {
5837         *rx_utf8 = sv_longest;
5838         *rx_substr = NULL;
5839     } else {
5840         *rx_substr = sv_longest;
5841         *rx_utf8 = NULL;
5842     }
5843     /* end_shift is how many chars that must be matched that
5844         follow this item. We calculate it ahead of time as once the
5845         lookbehind offset is added in we lose the ability to correctly
5846         calculate it.*/
5847     ml = minlen ? *(minlen) : (SSize_t)longest_length;
5848     *rx_end_shift = ml - offset
5849         - longest_length + (SvTAIL(sv_longest) != 0)
5850         + lookbehind;
5851
5852     t = (eol/* Can't have SEOL and MULTI */
5853          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5854     fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5855
5856     return TRUE;
5857 }
5858
5859 /*
5860  * Perl_re_op_compile - the perl internal RE engine's function to compile a
5861  * regular expression into internal code.
5862  * The pattern may be passed either as:
5863  *    a list of SVs (patternp plus pat_count)
5864  *    a list of OPs (expr)
5865  * If both are passed, the SV list is used, but the OP list indicates
5866  * which SVs are actually pre-compiled code blocks
5867  *
5868  * The SVs in the list have magic and qr overloading applied to them (and
5869  * the list may be modified in-place with replacement SVs in the latter
5870  * case).
5871  *
5872  * If the pattern hasn't changed from old_re, then old_re will be
5873  * returned.
5874  *
5875  * eng is the current engine. If that engine has an op_comp method, then
5876  * handle directly (i.e. we assume that op_comp was us); otherwise, just
5877  * do the initial concatenation of arguments and pass on to the external
5878  * engine.
5879  *
5880  * If is_bare_re is not null, set it to a boolean indicating whether the
5881  * arg list reduced (after overloading) to a single bare regex which has
5882  * been returned (i.e. /$qr/).
5883  *
5884  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5885  *
5886  * pm_flags contains the PMf_* flags, typically based on those from the
5887  * pm_flags field of the related PMOP. Currently we're only interested in
5888  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5889  *
5890  * We can't allocate space until we know how big the compiled form will be,
5891  * but we can't compile it (and thus know how big it is) until we've got a
5892  * place to put the code.  So we cheat:  we compile it twice, once with code
5893  * generation turned off and size counting turned on, and once "for real".
5894  * This also means that we don't allocate space until we are sure that the
5895  * thing really will compile successfully, and we never have to move the
5896  * code and thus invalidate pointers into it.  (Note that it has to be in
5897  * one piece because free() must be able to free it all.) [NB: not true in perl]
5898  *
5899  * Beware that the optimization-preparation code in here knows about some
5900  * of the structure of the compiled regexp.  [I'll say.]
5901  */
5902
5903 REGEXP *
5904 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5905                     OP *expr, const regexp_engine* eng, REGEXP *old_re,
5906                      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5907 {
5908     dVAR;
5909     REGEXP *rx;
5910     struct regexp *r;
5911     regexp_internal *ri;
5912     STRLEN plen;
5913     char *exp;
5914     regnode *scan;
5915     I32 flags;
5916     SSize_t minlen = 0;
5917     U32 rx_flags;
5918     SV *pat;
5919     SV *code_blocksv = NULL;
5920     SV** new_patternp = patternp;
5921
5922     /* these are all flags - maybe they should be turned
5923      * into a single int with different bit masks */
5924     I32 sawlookahead = 0;
5925     I32 sawplus = 0;
5926     I32 sawopen = 0;
5927     I32 sawminmod = 0;
5928
5929     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5930     bool recompile = 0;
5931     bool runtime_code = 0;
5932     scan_data_t data;
5933     RExC_state_t RExC_state;
5934     RExC_state_t * const pRExC_state = &RExC_state;
5935 #ifdef TRIE_STUDY_OPT
5936     int restudied = 0;
5937     RExC_state_t copyRExC_state;
5938 #endif
5939     GET_RE_DEBUG_FLAGS_DECL;
5940
5941     PERL_ARGS_ASSERT_RE_OP_COMPILE;
5942
5943     DEBUG_r(if (!PL_colorset) reginitcolors());
5944
5945 #ifndef PERL_IN_XSUB_RE
5946     /* Initialize these here instead of as-needed, as is quick and avoids
5947      * having to test them each time otherwise */
5948     if (! PL_AboveLatin1) {
5949         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5950         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5951         PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
5952
5953         PL_Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(ASCII_invlist);
5954         PL_L1Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(ASCII_invlist);
5955         PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(ASCII_invlist);
5956
5957         PL_L1Posix_ptrs[_CC_ALPHANUMERIC]
5958                                 = _new_invlist_C_array(L1PosixAlnum_invlist);
5959         PL_Posix_ptrs[_CC_ALPHANUMERIC]
5960                                 = _new_invlist_C_array(PosixAlnum_invlist);
5961
5962         PL_L1Posix_ptrs[_CC_ALPHA]
5963                                 = _new_invlist_C_array(L1PosixAlpha_invlist);
5964         PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PosixAlpha_invlist);
5965
5966         PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PosixBlank_invlist);
5967         PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(XPosixBlank_invlist);
5968
5969         /* Cased is the same as Alpha in the ASCII range */
5970         PL_L1Posix_ptrs[_CC_CASED] =  _new_invlist_C_array(L1Cased_invlist);
5971         PL_Posix_ptrs[_CC_CASED] =  _new_invlist_C_array(PosixAlpha_invlist);
5972
5973         PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PosixCntrl_invlist);
5974         PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(XPosixCntrl_invlist);
5975
5976         PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5977         PL_L1Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5978
5979         PL_L1Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(L1PosixGraph_invlist);
5980         PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PosixGraph_invlist);
5981
5982         PL_L1Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(L1PosixLower_invlist);
5983         PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PosixLower_invlist);
5984
5985         PL_L1Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(L1PosixPrint_invlist);
5986         PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PosixPrint_invlist);
5987
5988         PL_L1Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(L1PosixPunct_invlist);
5989         PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PosixPunct_invlist);
5990
5991         PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PerlSpace_invlist);
5992         PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(XPerlSpace_invlist);
5993         PL_Posix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(PosixSpace_invlist);
5994         PL_XPosix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(XPosixSpace_invlist);
5995
5996         PL_L1Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(L1PosixUpper_invlist);
5997         PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PosixUpper_invlist);
5998
5999         PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist);
6000
6001         PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PosixWord_invlist);
6002         PL_L1Posix_ptrs[_CC_WORDCHAR]
6003                                 = _new_invlist_C_array(L1PosixWord_invlist);
6004
6005         PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PosixXDigit_invlist);
6006         PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist);
6007
6008         PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
6009     }
6010 #endif
6011
6012     pRExC_state->code_blocks = NULL;
6013     pRExC_state->num_code_blocks = 0;
6014
6015     if (is_bare_re)
6016         *is_bare_re = FALSE;
6017
6018     if (expr && (expr->op_type == OP_LIST ||
6019                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6020         /* allocate code_blocks if needed */
6021         OP *o;
6022         int ncode = 0;
6023
6024         for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
6025             if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6026                 ncode++; /* count of DO blocks */
6027         if (ncode) {
6028             pRExC_state->num_code_blocks = ncode;
6029             Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6030         }
6031     }
6032
6033     if (!pat_count) {
6034         /* compile-time pattern with just OP_CONSTs and DO blocks */
6035
6036         int n;
6037         OP *o;
6038
6039         /* find how many CONSTs there are */
6040         assert(expr);
6041         n = 0;
6042         if (expr->op_type == OP_CONST)
6043             n = 1;
6044         else
6045             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6046                 if (o->op_type == OP_CONST)
6047                     n++;
6048             }
6049
6050         /* fake up an SV array */
6051
6052         assert(!new_patternp);
6053         Newx(new_patternp, n, SV*);
6054         SAVEFREEPV(new_patternp);
6055         pat_count = n;
6056
6057         n = 0;
6058         if (expr->op_type == OP_CONST)
6059             new_patternp[n] = cSVOPx_sv(expr);
6060         else
6061             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6062                 if (o->op_type == OP_CONST)
6063                     new_patternp[n++] = cSVOPo_sv;
6064             }
6065
6066     }
6067
6068     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6069         "Assembling pattern from %d elements%s\n", pat_count,
6070             orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6071
6072     /* set expr to the first arg op */
6073
6074     if (pRExC_state->num_code_blocks
6075          && expr->op_type != OP_CONST)
6076     {
6077             expr = cLISTOPx(expr)->op_first;
6078             assert(   expr->op_type == OP_PUSHMARK
6079                    || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6080                    || expr->op_type == OP_PADRANGE);
6081             expr = expr->op_sibling;
6082     }
6083
6084     pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6085                         expr, &recompile, NULL);
6086
6087     /* handle bare (possibly after overloading) regex: foo =~ $re */
6088     {
6089         SV *re = pat;
6090         if (SvROK(re))
6091             re = SvRV(re);
6092         if (SvTYPE(re) == SVt_REGEXP) {
6093             if (is_bare_re)
6094                 *is_bare_re = TRUE;
6095             SvREFCNT_inc(re);
6096             Safefree(pRExC_state->code_blocks);
6097             DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6098                 "Precompiled pattern%s\n",
6099                     orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6100
6101             return (REGEXP*)re;
6102         }
6103     }
6104
6105     exp = SvPV_nomg(pat, plen);
6106
6107     if (!eng->op_comp) {
6108         if ((SvUTF8(pat) && IN_BYTES)
6109                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6110         {
6111             /* make a temporary copy; either to convert to bytes,
6112              * or to avoid repeating get-magic / overloaded stringify */
6113             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6114                                         (IN_BYTES ? 0 : SvUTF8(pat)));
6115         }
6116         Safefree(pRExC_state->code_blocks);
6117         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6118     }
6119
6120     /* ignore the utf8ness if the pattern is 0 length */
6121     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6122     RExC_uni_semantics = 0;
6123     RExC_contains_locale = 0;
6124     RExC_contains_i = 0;
6125     pRExC_state->runtime_code_qr = NULL;
6126
6127     DEBUG_COMPILE_r({
6128             SV *dsv= sv_newmortal();
6129             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6130             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6131                           PL_colors[4],PL_colors[5],s);
6132         });
6133
6134   redo_first_pass:
6135     /* we jump here if we upgrade the pattern to utf8 and have to
6136      * recompile */
6137
6138     if ((pm_flags & PMf_USE_RE_EVAL)
6139                 /* this second condition covers the non-regex literal case,
6140                  * i.e.  $foo =~ '(?{})'. */
6141                 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6142     )
6143         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6144
6145     /* return old regex if pattern hasn't changed */
6146     /* XXX: note in the below we have to check the flags as well as the pattern.
6147      *
6148      * Things get a touch tricky as we have to compare the utf8 flag independently
6149      * from the compile flags.
6150      */
6151
6152     if (   old_re
6153         && !recompile
6154         && !!RX_UTF8(old_re) == !!RExC_utf8
6155         && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6156         && RX_PRECOMP(old_re)
6157         && RX_PRELEN(old_re) == plen
6158         && memEQ(RX_PRECOMP(old_re), exp, plen)
6159         && !runtime_code /* with runtime code, always recompile */ )
6160     {
6161         Safefree(pRExC_state->code_blocks);
6162         return old_re;
6163     }
6164
6165     rx_flags = orig_rx_flags;
6166
6167     if (rx_flags & PMf_FOLD) {
6168         RExC_contains_i = 1;
6169     }
6170     if (initial_charset == REGEX_LOCALE_CHARSET) {
6171         RExC_contains_locale = 1;
6172     }
6173     else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6174
6175         /* Set to use unicode semantics if the pattern is in utf8 and has the
6176          * 'depends' charset specified, as it means unicode when utf8  */
6177         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6178     }
6179
6180     RExC_precomp = exp;
6181     RExC_flags = rx_flags;
6182     RExC_pm_flags = pm_flags;
6183
6184     if (runtime_code) {
6185         if (TAINTING_get && TAINT_get)
6186             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6187
6188         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6189             /* whoops, we have a non-utf8 pattern, whilst run-time code
6190              * got compiled as utf8. Try again with a utf8 pattern */
6191             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6192                                     pRExC_state->num_code_blocks);
6193             goto redo_first_pass;
6194         }
6195     }
6196     assert(!pRExC_state->runtime_code_qr);
6197
6198     RExC_sawback = 0;
6199
6200     RExC_seen = 0;
6201     RExC_in_lookbehind = 0;
6202     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6203     RExC_extralen = 0;
6204     RExC_override_recoding = 0;
6205     RExC_in_multi_char_class = 0;
6206
6207     /* First pass: determine size, legality. */
6208     RExC_parse = exp;
6209     RExC_start = exp;
6210     RExC_end = exp + plen;
6211     RExC_naughty = 0;
6212     RExC_npar = 1;
6213     RExC_nestroot = 0;
6214     RExC_size = 0L;
6215     RExC_emit = (regnode *) &RExC_emit_dummy;
6216     RExC_whilem_seen = 0;
6217     RExC_open_parens = NULL;
6218     RExC_close_parens = NULL;
6219     RExC_opend = NULL;
6220     RExC_paren_names = NULL;
6221 #ifdef DEBUGGING
6222     RExC_paren_name_list = NULL;
6223 #endif
6224     RExC_recurse = NULL;
6225     RExC_study_chunk_recursed = NULL;
6226     RExC_study_chunk_recursed_bytes= 0;
6227     RExC_recurse_count = 0;
6228     pRExC_state->code_index = 0;
6229
6230 #if 0 /* REGC() is (currently) a NOP at the first pass.
6231        * Clever compilers notice this and complain. --jhi */
6232     REGC((U8)REG_MAGIC, (char*)RExC_emit);
6233 #endif
6234     DEBUG_PARSE_r(
6235         PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6236         RExC_lastnum=0;
6237         RExC_lastparse=NULL;
6238     );
6239     /* reg may croak on us, not giving us a chance to free
6240        pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
6241        need it to survive as long as the regexp (qr/(?{})/).
6242        We must check that code_blocksv is not already set, because we may
6243        have jumped back to restart the sizing pass. */
6244     if (pRExC_state->code_blocks && !code_blocksv) {
6245         code_blocksv = newSV_type(SVt_PV);
6246         SAVEFREESV(code_blocksv);
6247         SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6248         SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6249     }
6250     if (reg(pRExC_state, 0, &flags,1) == NULL) {
6251         /* It's possible to write a regexp in ascii that represents Unicode
6252         codepoints outside of the byte range, such as via \x{100}. If we
6253         detect such a sequence we have to convert the entire pattern to utf8
6254         and then recompile, as our sizing calculation will have been based
6255         on 1 byte == 1 character, but we will need to use utf8 to encode
6256         at least some part of the pattern, and therefore must convert the whole
6257         thing.
6258         -- dmq */
6259         if (flags & RESTART_UTF8) {
6260             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6261                                     pRExC_state->num_code_blocks);
6262             goto redo_first_pass;
6263         }
6264         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6265     }
6266     if (code_blocksv)
6267         SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6268
6269     DEBUG_PARSE_r({
6270         PerlIO_printf(Perl_debug_log,
6271             "Required size %"IVdf" nodes\n"
6272             "Starting second pass (creation)\n",
6273             (IV)RExC_size);
6274         RExC_lastnum=0;
6275         RExC_lastparse=NULL;
6276     });
6277
6278     /* The first pass could have found things that force Unicode semantics */
6279     if ((RExC_utf8 || RExC_uni_semantics)
6280          && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6281     {
6282         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6283     }
6284
6285     /* Small enough for pointer-storage convention?
6286        If extralen==0, this means that we will not need long jumps. */
6287     if (RExC_size >= 0x10000L && RExC_extralen)
6288         RExC_size += RExC_extralen;
6289     else
6290         RExC_extralen = 0;
6291     if (RExC_whilem_seen > 15)
6292         RExC_whilem_seen = 15;
6293
6294     /* Allocate space and zero-initialize. Note, the two step process
6295        of zeroing when in debug mode, thus anything assigned has to
6296        happen after that */
6297     rx = (REGEXP*) newSV_type(SVt_REGEXP);
6298     r = ReANY(rx);
6299     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6300          char, regexp_internal);
6301     if ( r == NULL || ri == NULL )
6302         FAIL("Regexp out of space");
6303 #ifdef DEBUGGING
6304     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6305     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
6306 #else
6307     /* bulk initialize base fields with 0. */
6308     Zero(ri, sizeof(regexp_internal), char);
6309 #endif
6310
6311     /* non-zero initialization begins here */
6312     RXi_SET( r, ri );
6313     r->engine= eng;
6314     r->extflags = rx_flags;
6315     RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6316
6317     if (pm_flags & PMf_IS_QR) {
6318         ri->code_blocks = pRExC_state->code_blocks;
6319         ri->num_code_blocks = pRExC_state->num_code_blocks;
6320     }
6321     else
6322     {
6323         int n;
6324         for (n = 0; n < pRExC_state->num_code_blocks; n++)
6325             if (pRExC_state->code_blocks[n].src_regex)
6326                 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6327         SAVEFREEPV(pRExC_state->code_blocks);
6328     }
6329
6330     {
6331         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6332         bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
6333
6334         /* The caret is output if there are any defaults: if not all the STD
6335          * flags are set, or if no character set specifier is needed */
6336         bool has_default =
6337                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6338                     || ! has_charset);
6339         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
6340         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6341                             >> RXf_PMf_STD_PMMOD_SHIFT);
6342         const char *fptr = STD_PAT_MODS;        /*"msix"*/
6343         char *p;
6344         /* Allocate for the worst case, which is all the std flags are turned
6345          * on.  If more precision is desired, we could do a population count of
6346          * the flags set.  This could be done with a small lookup table, or by
6347          * shifting, masking and adding, or even, when available, assembly
6348          * language for a machine-language population count.
6349          * We never output a minus, as all those are defaults, so are
6350          * covered by the caret */
6351         const STRLEN wraplen = plen + has_p + has_runon
6352             + has_default       /* If needs a caret */
6353
6354                 /* If needs a character set specifier */
6355             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6356             + (sizeof(STD_PAT_MODS) - 1)
6357             + (sizeof("(?:)") - 1);
6358
6359         Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6360         r->xpv_len_u.xpvlenu_pv = p;
6361         if (RExC_utf8)
6362             SvFLAGS(rx) |= SVf_UTF8;
6363         *p++='('; *p++='?';
6364
6365         /* If a default, cover it using the caret */
6366         if (has_default) {
6367             *p++= DEFAULT_PAT_MOD;
6368         }
6369         if (has_charset) {
6370             STRLEN len;
6371             const char* const name = get_regex_charset_name(r->extflags, &len);
6372             Copy(name, p, len, char);
6373             p += len;
6374         }
6375         if (has_p)
6376             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6377         {
6378             char ch;
6379             while((ch = *fptr++)) {
6380                 if(reganch & 1)
6381                     *p++ = ch;
6382                 reganch >>= 1;
6383             }
6384         }
6385
6386         *p++ = ':';
6387         Copy(RExC_precomp, p, plen, char);
6388         assert ((RX_WRAPPED(rx) - p) < 16);
6389         r->pre_prefix = p - RX_WRAPPED(rx);
6390         p += plen;
6391         if (has_runon)
6392             *p++ = '\n';
6393         *p++ = ')';
6394         *p = 0;
6395         SvCUR_set(rx, p - RX_WRAPPED(rx));
6396     }
6397
6398     r->intflags = 0;
6399     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6400
6401     /* setup various meta data about recursion, this all requires
6402      * RExC_npar to be correctly set, and a bit later on we clear it */
6403     if (RExC_seen & REG_SEEN_RECURSE) {
6404         Newxz(RExC_open_parens, RExC_npar,regnode *);
6405         SAVEFREEPV(RExC_open_parens);
6406         Newxz(RExC_close_parens,RExC_npar,regnode *);
6407         SAVEFREEPV(RExC_close_parens);
6408     }
6409     if (RExC_seen & (REG_SEEN_RECURSE | REG_SEEN_GOSTART)) {
6410         /* Note, RExC_npar is 1 + the number of parens in a pattern.
6411          * So its 1 if there are no parens. */
6412         RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6413                                          ((RExC_npar & 0x07) != 0);
6414         Newx(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6415         SAVEFREEPV(RExC_study_chunk_recursed);
6416     }
6417
6418     /* Useful during FAIL. */
6419 #ifdef RE_TRACK_PATTERN_OFFSETS
6420     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6421     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6422                           "%s %"UVuf" bytes for offset annotations.\n",
6423                           ri->u.offsets ? "Got" : "Couldn't get",
6424                           (UV)((2*RExC_size+1) * sizeof(U32))));
6425 #endif
6426     SetProgLen(ri,RExC_size);
6427     RExC_rx_sv = rx;
6428     RExC_rx = r;
6429     RExC_rxi = ri;
6430
6431     /* Second pass: emit code. */
6432     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
6433     RExC_pm_flags = pm_flags;
6434     RExC_parse = exp;
6435     RExC_end = exp + plen;
6436     RExC_naughty = 0;
6437     RExC_npar = 1;
6438     RExC_emit_start = ri->program;
6439     RExC_emit = ri->program;
6440     RExC_emit_bound = ri->program + RExC_size + 1;
6441     pRExC_state->code_index = 0;
6442
6443     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6444     if (reg(pRExC_state, 0, &flags,1) == NULL) {
6445         ReREFCNT_dec(rx);
6446         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6447     }
6448     /* XXXX To minimize changes to RE engine we always allocate
6449        3-units-long substrs field. */
6450     Newx(r->substrs, 1, struct reg_substr_data);
6451     if (RExC_recurse_count) {
6452         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6453         SAVEFREEPV(RExC_recurse);
6454     }
6455
6456 reStudy:
6457     r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6458     Zero(r->substrs, 1, struct reg_substr_data);
6459     if (RExC_study_chunk_recursed)
6460         Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6461
6462 #ifdef TRIE_STUDY_OPT
6463     if (!restudied) {
6464         StructCopy(&zero_scan_data, &data, scan_data_t);
6465         copyRExC_state = RExC_state;
6466     } else {
6467         U32 seen=RExC_seen;
6468         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6469
6470         RExC_state = copyRExC_state;
6471         if (seen & REG_TOP_LEVEL_BRANCHES)
6472             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6473         else
6474             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
6475         StructCopy(&zero_scan_data, &data, scan_data_t);
6476     }
6477 #else
6478     StructCopy(&zero_scan_data, &data, scan_data_t);
6479 #endif
6480
6481     /* Dig out information for optimizations. */
6482     r->extflags = RExC_flags; /* was pm_op */
6483     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6484
6485     if (UTF)
6486         SvUTF8_on(rx);  /* Unicode in it? */
6487     ri->regstclass = NULL;
6488     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
6489         r->intflags |= PREGf_NAUGHTY;
6490     scan = ri->program + 1;             /* First BRANCH. */
6491
6492     /* testing for BRANCH here tells us whether there is "must appear"
6493        data in the pattern. If there is then we can use it for optimisations */
6494     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
6495         SSize_t fake;
6496         STRLEN longest_float_length, longest_fixed_length;
6497         regnode_ssc ch_class; /* pointed to by data */
6498         int stclass_flag;
6499         SSize_t last_close = 0; /* pointed to by data */
6500         regnode *first= scan;
6501         regnode *first_next= regnext(first);
6502         /*
6503          * Skip introductions and multiplicators >= 1
6504          * so that we can extract the 'meat' of the pattern that must
6505          * match in the large if() sequence following.
6506          * NOTE that EXACT is NOT covered here, as it is normally
6507          * picked up by the optimiser separately.
6508          *
6509          * This is unfortunate as the optimiser isnt handling lookahead
6510          * properly currently.
6511          *
6512          */
6513         while ((OP(first) == OPEN && (sawopen = 1)) ||
6514                /* An OR of *one* alternative - should not happen now. */
6515             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6516             /* for now we can't handle lookbehind IFMATCH*/
6517             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6518             (OP(first) == PLUS) ||
6519             (OP(first) == MINMOD) ||
6520                /* An {n,m} with n>0 */
6521             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6522             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6523         {
6524                 /*
6525                  * the only op that could be a regnode is PLUS, all the rest
6526                  * will be regnode_1 or regnode_2.
6527                  *
6528                  * (yves doesn't think this is true)
6529                  */
6530                 if (OP(first) == PLUS)
6531                     sawplus = 1;
6532                 else {
6533                     if (OP(first) == MINMOD)
6534                         sawminmod = 1;
6535                     first += regarglen[OP(first)];
6536                 }
6537                 first = NEXTOPER(first);
6538                 first_next= regnext(first);
6539         }
6540
6541         /* Starting-point info. */
6542       again:
6543         DEBUG_PEEP("first:",first,0);
6544         /* Ignore EXACT as we deal with it later. */
6545         if (PL_regkind[OP(first)] == EXACT) {
6546             if (OP(first) == EXACT)
6547                 NOOP;   /* Empty, get anchored substr later. */
6548             else
6549                 ri->regstclass = first;
6550         }
6551 #ifdef TRIE_STCLASS
6552         else if (PL_regkind[OP(first)] == TRIE &&
6553                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6554         {
6555             regnode *trie_op;
6556             /* this can happen only on restudy */
6557             if ( OP(first) == TRIE ) {
6558                 struct regnode_1 *trieop = (struct regnode_1 *)
6559                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
6560                 StructCopy(first,trieop,struct regnode_1);
6561                 trie_op=(regnode *)trieop;
6562             } else {
6563                 struct regnode_charclass *trieop = (struct regnode_charclass *)
6564                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6565                 StructCopy(first,trieop,struct regnode_charclass);
6566                 trie_op=(regnode *)trieop;
6567             }
6568             OP(trie_op)+=2;
6569             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6570             ri->regstclass = trie_op;
6571         }
6572 #endif
6573         else if (REGNODE_SIMPLE(OP(first)))
6574             ri->regstclass = first;
6575         else if (PL_regkind[OP(first)] == BOUND ||
6576                  PL_regkind[OP(first)] == NBOUND)
6577             ri->regstclass = first;
6578         else if (PL_regkind[OP(first)] == BOL) {
6579             r->extflags |= (OP(first) == MBOL
6580                            ? RXf_ANCH_MBOL
6581                            : (OP(first) == SBOL
6582                               ? RXf_ANCH_SBOL
6583                               : RXf_ANCH_BOL));
6584             first = NEXTOPER(first);
6585             goto again;
6586         }
6587         else if (OP(first) == GPOS) {
6588             r->extflags |= RXf_ANCH_GPOS;
6589             first = NEXTOPER(first);
6590             goto again;
6591         }
6592         else if ((!sawopen || !RExC_sawback) &&
6593             (OP(first) == STAR &&
6594             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6595             !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6596         {
6597             /* turn .* into ^.* with an implied $*=1 */
6598             const int type =
6599                 (OP(NEXTOPER(first)) == REG_ANY)
6600                     ? RXf_ANCH_MBOL
6601                     : RXf_ANCH_SBOL;
6602             r->extflags |= type;
6603             r->intflags |= PREGf_IMPLICIT;
6604             first = NEXTOPER(first);
6605             goto again;
6606         }
6607         if (sawplus && !sawminmod && !sawlookahead && (!sawopen || !RExC_sawback)
6608             && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6609             /* x+ must match at the 1st pos of run of x's */
6610             r->intflags |= PREGf_SKIP;
6611
6612         /* Scan is after the zeroth branch, first is atomic matcher. */
6613 #ifdef TRIE_STUDY_OPT
6614         DEBUG_PARSE_r(
6615             if (!restudied)
6616                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6617                               (IV)(first - scan + 1))
6618         );
6619 #else
6620         DEBUG_PARSE_r(
6621             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6622                 (IV)(first - scan + 1))
6623         );
6624 #endif
6625
6626
6627         /*
6628         * If there's something expensive in the r.e., find the
6629         * longest literal string that must appear and make it the
6630         * regmust.  Resolve ties in favor of later strings, since
6631         * the regstart check works with the beginning of the r.e.
6632         * and avoiding duplication strengthens checking.  Not a
6633         * strong reason, but sufficient in the absence of others.
6634         * [Now we resolve ties in favor of the earlier string if
6635         * it happens that c_offset_min has been invalidated, since the
6636         * earlier string may buy us something the later one won't.]
6637         */
6638
6639         data.longest_fixed = newSVpvs("");
6640         data.longest_float = newSVpvs("");
6641         data.last_found = newSVpvs("");
6642         data.longest = &(data.longest_fixed);
6643         ENTER_with_name("study_chunk");
6644         SAVEFREESV(data.longest_fixed);
6645         SAVEFREESV(data.longest_float);
6646         SAVEFREESV(data.last_found);
6647         first = scan;
6648         if (!ri->regstclass) {
6649             ssc_init(pRExC_state, &ch_class);
6650             data.start_class = &ch_class;
6651             stclass_flag = SCF_DO_STCLASS_AND;
6652         } else                          /* XXXX Check for BOUND? */
6653             stclass_flag = 0;
6654         data.last_closep = &last_close;
6655
6656         DEBUG_RExC_seen();
6657         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6658             &data, -1, 0, NULL,
6659             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
6660                           | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6661             0);
6662
6663
6664         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6665
6666
6667         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6668              && data.last_start_min == 0 && data.last_end > 0
6669              && !RExC_seen_zerolen
6670              && !(RExC_seen & REG_SEEN_VERBARG)
6671              && !((RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6672             r->extflags |= RXf_CHECK_ALL;
6673         scan_commit(pRExC_state, &data,&minlen,0);
6674
6675         longest_float_length = CHR_SVLEN(data.longest_float);
6676
6677         if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
6678                    && data.offset_fixed == data.offset_float_min
6679                    && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6680             && S_setup_longest (aTHX_ pRExC_state,
6681                                     data.longest_float,
6682                                     &(r->float_utf8),
6683                                     &(r->float_substr),
6684                                     &(r->float_end_shift),
6685                                     data.lookbehind_float,
6686                                     data.offset_float_min,
6687                                     data.minlen_float,
6688                                     longest_float_length,
6689                                     cBOOL(data.flags & SF_FL_BEFORE_EOL),
6690                                     cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6691         {
6692             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6693             r->float_max_offset = data.offset_float_max;
6694             if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
6695                 r->float_max_offset -= data.lookbehind_float;
6696             SvREFCNT_inc_simple_void_NN(data.longest_float);
6697         }
6698         else {
6699             r->float_substr = r->float_utf8 = NULL;
6700             longest_float_length = 0;
6701         }
6702
6703         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6704
6705         if (S_setup_longest (aTHX_ pRExC_state,
6706                                 data.longest_fixed,
6707                                 &(r->anchored_utf8),
6708                                 &(r->anchored_substr),
6709                                 &(r->anchored_end_shift),
6710                                 data.lookbehind_fixed,
6711                                 data.offset_fixed,
6712                                 data.minlen_fixed,
6713                                 longest_fixed_length,
6714                                 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6715                                 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6716         {
6717             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6718             SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6719         }
6720         else {
6721             r->anchored_substr = r->anchored_utf8 = NULL;
6722             longest_fixed_length = 0;
6723         }
6724         LEAVE_with_name("study_chunk");
6725
6726         if (ri->regstclass
6727             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6728             ri->regstclass = NULL;
6729
6730         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6731             && stclass_flag
6732             && ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
6733             && !ssc_is_anything(data.start_class))
6734         {
6735             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
6736
6737             ssc_finalize(pRExC_state, data.start_class);
6738
6739             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
6740             StructCopy(data.start_class,
6741                        (regnode_ssc*)RExC_rxi->data->data[n],
6742                        regnode_ssc);
6743             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6744             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6745             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6746                       regprop(r, sv, (regnode*)data.start_class);
6747                       PerlIO_printf(Perl_debug_log,
6748                                     "synthetic stclass \"%s\".\n",
6749                                     SvPVX_const(sv));});
6750             data.start_class = NULL;
6751         }
6752
6753         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6754         if (longest_fixed_length > longest_float_length) {
6755             r->check_end_shift = r->anchored_end_shift;
6756             r->check_substr = r->anchored_substr;
6757             r->check_utf8 = r->anchored_utf8;
6758             r->check_offset_min = r->check_offset_max = r->anchored_offset;
6759             if (r->extflags & RXf_ANCH_SINGLE)
6760                 r->extflags |= RXf_NOSCAN;
6761         }
6762         else {
6763             r->check_end_shift = r->float_end_shift;
6764             r->check_substr = r->float_substr;
6765             r->check_utf8 = r->float_utf8;
6766             r->check_offset_min = r->float_min_offset;
6767             r->check_offset_max = r->float_max_offset;
6768         }
6769         if ((r->check_substr || r->check_utf8) ) {
6770             r->extflags |= RXf_USE_INTUIT;
6771             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6772                 r->extflags |= RXf_INTUIT_TAIL;
6773         }
6774         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6775         if ( (STRLEN)minlen < longest_float_length )
6776             minlen= longest_float_length;
6777         if ( (STRLEN)minlen < longest_fixed_length )
6778             minlen= longest_fixed_length;
6779         */
6780     }
6781     else {
6782         /* Several toplevels. Best we can is to set minlen. */
6783         SSize_t fake;
6784         regnode_ssc ch_class;
6785         SSize_t last_close = 0;
6786
6787         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6788
6789         scan = ri->program + 1;
6790         ssc_init(pRExC_state, &ch_class);
6791         data.start_class = &ch_class;
6792         data.last_closep = &last_close;
6793
6794         DEBUG_RExC_seen();
6795         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6796             &data, -1, 0, NULL,
6797             SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS
6798                               |(restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6799             0);
6800
6801         CHECK_RESTUDY_GOTO_butfirst(NOOP);
6802
6803         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6804                 = r->float_substr = r->float_utf8 = NULL;
6805
6806         if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
6807             && ! ssc_is_anything(data.start_class))
6808         {
6809             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
6810
6811             ssc_finalize(pRExC_state, data.start_class);
6812
6813             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
6814             StructCopy(data.start_class,
6815                        (regnode_ssc*)RExC_rxi->data->data[n],
6816                        regnode_ssc);
6817             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6818             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6819             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6820                       regprop(r, sv, (regnode*)data.start_class);
6821                       PerlIO_printf(Perl_debug_log,
6822                                     "synthetic stclass \"%s\".\n",
6823                                     SvPVX_const(sv));});
6824             data.start_class = NULL;
6825         }
6826     }
6827
6828     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6829        the "real" pattern. */
6830     DEBUG_OPTIMISE_r({
6831         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6832                       (IV)minlen, (IV)r->minlen);
6833     });
6834     r->minlenret = minlen;
6835     if (r->minlen < minlen)
6836         r->minlen = minlen;
6837
6838     if (RExC_seen & REG_SEEN_GPOS)
6839         r->extflags |= RXf_GPOS_SEEN;
6840     if (RExC_seen & REG_SEEN_LOOKBEHIND)
6841         r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the lookbehind */
6842     if (pRExC_state->num_code_blocks)
6843         r->extflags |= RXf_EVAL_SEEN;
6844     if (RExC_seen & REG_SEEN_CANY)
6845         r->extflags |= RXf_CANY_SEEN;
6846     if (RExC_seen & REG_SEEN_VERBARG)
6847     {
6848         r->intflags |= PREGf_VERBARG_SEEN;
6849         r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
6850     }
6851     if (RExC_seen & REG_SEEN_CUTGROUP)
6852         r->intflags |= PREGf_CUTGROUP_SEEN;
6853     if (pm_flags & PMf_USE_RE_EVAL)
6854         r->intflags |= PREGf_USE_RE_EVAL;
6855     if (RExC_paren_names)
6856         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6857     else
6858         RXp_PAREN_NAMES(r) = NULL;
6859
6860     {
6861         regnode *first = ri->program + 1;
6862         U8 fop = OP(first);
6863         regnode *next = NEXTOPER(first);
6864         U8 nop = OP(next);
6865
6866         if (PL_regkind[fop] == NOTHING && nop == END)
6867             r->extflags |= RXf_NULL;
6868         else if (PL_regkind[fop] == BOL && nop == END)
6869             r->extflags |= RXf_START_ONLY;
6870         else if (fop == PLUS && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE && OP(regnext(first)) == END)
6871             r->extflags |= RXf_WHITE;
6872         else if ( r->extflags & RXf_SPLIT && fop == EXACT && STR_LEN(first) == 1 && *(STRING(first)) == ' ' && OP(regnext(first)) == END )
6873             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6874
6875     }
6876 #ifdef DEBUGGING
6877     if (RExC_paren_names) {
6878         ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
6879         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6880     } else
6881 #endif
6882         ri->name_list_idx = 0;
6883
6884     if (RExC_recurse_count) {
6885         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6886             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6887             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6888         }
6889     }
6890     Newxz(r->offs, RExC_npar, regexp_paren_pair);
6891     /* assume we don't need to swap parens around before we match */
6892
6893     DEBUG_DUMP_r({
6894         DEBUG_RExC_seen();
6895         PerlIO_printf(Perl_debug_log,"Final program:\n");
6896         regdump(r);
6897     });
6898 #ifdef RE_TRACK_PATTERN_OFFSETS
6899     DEBUG_OFFSETS_r(if (ri->u.offsets) {
6900         const STRLEN len = ri->u.offsets[0];
6901         STRLEN i;
6902         GET_RE_DEBUG_FLAGS_DECL;
6903         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6904         for (i = 1; i <= len; i++) {
6905             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6906                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6907                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6908             }
6909         PerlIO_printf(Perl_debug_log, "\n");
6910     });
6911 #endif
6912
6913 #ifdef USE_ITHREADS
6914     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
6915      * by setting the regexp SV to readonly-only instead. If the
6916      * pattern's been recompiled, the USEDness should remain. */
6917     if (old_re && SvREADONLY(old_re))
6918         SvREADONLY_on(rx);
6919 #endif
6920     return rx;
6921 }
6922
6923
6924 SV*
6925 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6926                     const U32 flags)
6927 {
6928     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6929
6930     PERL_UNUSED_ARG(value);
6931
6932     if (flags & RXapif_FETCH) {
6933         return reg_named_buff_fetch(rx, key, flags);
6934     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6935         Perl_croak_no_modify();
6936         return NULL;
6937     } else if (flags & RXapif_EXISTS) {
6938         return reg_named_buff_exists(rx, key, flags)
6939             ? &PL_sv_yes
6940             : &PL_sv_no;
6941     } else if (flags & RXapif_REGNAMES) {
6942         return reg_named_buff_all(rx, flags);
6943     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6944         return reg_named_buff_scalar(rx, flags);
6945     } else {
6946         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6947         return NULL;
6948     }
6949 }
6950
6951 SV*
6952 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6953                          const U32 flags)
6954 {
6955     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6956     PERL_UNUSED_ARG(lastkey);
6957
6958     if (flags & RXapif_FIRSTKEY)
6959         return reg_named_buff_firstkey(rx, flags);
6960     else if (flags & RXapif_NEXTKEY)
6961         return reg_named_buff_nextkey(rx, flags);
6962     else {
6963         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6964         return NULL;
6965     }
6966 }
6967
6968 SV*
6969 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6970                           const U32 flags)
6971 {
6972     AV *retarray = NULL;
6973     SV *ret;
6974     struct regexp *const rx = ReANY(r);
6975
6976     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6977
6978     if (flags & RXapif_ALL)
6979         retarray=newAV();
6980
6981     if (rx && RXp_PAREN_NAMES(rx)) {
6982         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6983         if (he_str) {
6984             IV i;
6985             SV* sv_dat=HeVAL(he_str);
6986             I32 *nums=(I32*)SvPVX(sv_dat);
6987             for ( i=0; i<SvIVX(sv_dat); i++ ) {
6988                 if ((I32)(rx->nparens) >= nums[i]
6989                     && rx->offs[nums[i]].start != -1
6990                     && rx->offs[nums[i]].end != -1)
6991                 {
6992                     ret = newSVpvs("");
6993                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6994                     if (!retarray)
6995                         return ret;
6996                 } else {
6997                     if (retarray)
6998                         ret = newSVsv(&PL_sv_undef);
6999                 }
7000                 if (retarray)
7001                     av_push(retarray, ret);
7002             }
7003             if (retarray)
7004                 return newRV_noinc(MUTABLE_SV(retarray));
7005         }
7006     }
7007     return NULL;
7008 }
7009
7010 bool
7011 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7012                            const U32 flags)
7013 {
7014     struct regexp *const rx = ReANY(r);
7015
7016     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7017
7018     if (rx && RXp_PAREN_NAMES(rx)) {
7019         if (flags & RXapif_ALL) {
7020             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7021         } else {
7022             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7023             if (sv) {
7024                 SvREFCNT_dec_NN(sv);
7025                 return TRUE;
7026             } else {
7027                 return FALSE;
7028             }
7029         }
7030     } else {
7031         return FALSE;
7032     }
7033 }
7034
7035 SV*
7036 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7037 {
7038     struct regexp *const rx = ReANY(r);
7039
7040     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7041
7042     if ( rx && RXp_PAREN_NAMES(rx) ) {
7043         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7044
7045         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7046     } else {
7047         return FALSE;
7048     }
7049 }
7050
7051 SV*
7052 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7053 {
7054     struct regexp *const rx = ReANY(r);
7055     GET_RE_DEBUG_FLAGS_DECL;
7056
7057     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7058
7059     if (rx && RXp_PAREN_NAMES(rx)) {
7060         HV *hv = RXp_PAREN_NAMES(rx);
7061         HE *temphe;
7062         while ( (temphe = hv_iternext_flags(hv,0)) ) {
7063             IV i;
7064             IV parno = 0;
7065             SV* sv_dat = HeVAL(temphe);
7066             I32 *nums = (I32*)SvPVX(sv_dat);
7067             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7068                 if ((I32)(rx->lastparen) >= nums[i] &&
7069                     rx->offs[nums[i]].start != -1 &&
7070                     rx->offs[nums[i]].end != -1)
7071                 {
7072                     parno = nums[i];
7073                     break;
7074                 }
7075             }
7076             if (parno || flags & RXapif_ALL) {
7077                 return newSVhek(HeKEY_hek(temphe));
7078             }
7079         }
7080     }
7081     return NULL;
7082 }
7083
7084 SV*
7085 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7086 {
7087     SV *ret;
7088     AV *av;
7089     SSize_t length;
7090     struct regexp *const rx = ReANY(r);
7091
7092     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7093
7094     if (rx && RXp_PAREN_NAMES(rx)) {
7095         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7096             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7097         } else if (flags & RXapif_ONE) {
7098             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7099             av = MUTABLE_AV(SvRV(ret));
7100             length = av_len(av);
7101             SvREFCNT_dec_NN(ret);
7102             return newSViv(length + 1);
7103         } else {
7104             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
7105             return NULL;
7106         }
7107     }
7108     return &PL_sv_undef;
7109 }
7110
7111 SV*
7112 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7113 {
7114     struct regexp *const rx = ReANY(r);
7115     AV *av = newAV();
7116
7117     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7118
7119     if (rx && RXp_PAREN_NAMES(rx)) {
7120         HV *hv= RXp_PAREN_NAMES(rx);
7121         HE *temphe;
7122         (void)hv_iterinit(hv);
7123         while ( (temphe = hv_iternext_flags(hv,0)) ) {
7124             IV i;
7125             IV parno = 0;
7126             SV* sv_dat = HeVAL(temphe);
7127             I32 *nums = (I32*)SvPVX(sv_dat);
7128             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7129                 if ((I32)(rx->lastparen) >= nums[i] &&
7130                     rx->offs[nums[i]].start != -1 &&
7131                     rx->offs[nums[i]].end != -1)
7132                 {
7133                     parno = nums[i];
7134                     break;
7135                 }
7136             }
7137             if (parno || flags & RXapif_ALL) {
7138                 av_push(av, newSVhek(HeKEY_hek(temphe)));
7139             }
7140         }
7141     }
7142
7143     return newRV_noinc(MUTABLE_SV(av));
7144 }
7145
7146 void
7147 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7148                              SV * const sv)
7149 {
7150     struct regexp *const rx = ReANY(r);
7151     char *s = NULL;
7152     SSize_t i = 0;
7153     SSize_t s1, t1;
7154     I32 n = paren;
7155
7156     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7157
7158     if (      n == RX_BUFF_IDX_CARET_PREMATCH
7159            || n == RX_BUFF_IDX_CARET_FULLMATCH
7160            || n == RX_BUFF_IDX_CARET_POSTMATCH
7161        )
7162     {
7163         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7164         if (!keepcopy) {
7165             /* on something like
7166              *    $r = qr/.../;
7167              *    /$qr/p;
7168              * the KEEPCOPY is set on the PMOP rather than the regex */
7169             if (PL_curpm && r == PM_GETRE(PL_curpm))
7170                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7171         }
7172         if (!keepcopy)
7173             goto ret_undef;
7174     }
7175
7176     if (!rx->subbeg)
7177         goto ret_undef;
7178
7179     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7180         /* no need to distinguish between them any more */
7181         n = RX_BUFF_IDX_FULLMATCH;
7182
7183     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7184         && rx->offs[0].start != -1)
7185     {
7186         /* $`, ${^PREMATCH} */
7187         i = rx->offs[0].start;
7188         s = rx->subbeg;
7189     }
7190     else
7191     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7192         && rx->offs[0].end != -1)
7193     {
7194         /* $', ${^POSTMATCH} */
7195         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7196         i = rx->sublen + rx->suboffset - rx->offs[0].end;
7197     }
7198     else
7199     if ( 0 <= n && n <= (I32)rx->nparens &&
7200         (s1 = rx->offs[n].start) != -1 &&
7201         (t1 = rx->offs[n].end) != -1)
7202     {
7203         /* $&, ${^MATCH},  $1 ... */
7204         i = t1 - s1;
7205         s = rx->subbeg + s1 - rx->suboffset;
7206     } else {
7207         goto ret_undef;
7208     }
7209
7210     assert(s >= rx->subbeg);
7211     assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7212     if (i >= 0) {
7213 #if NO_TAINT_SUPPORT
7214         sv_setpvn(sv, s, i);
7215 #else
7216         const int oldtainted = TAINT_get;
7217         TAINT_NOT;
7218         sv_setpvn(sv, s, i);
7219         TAINT_set(oldtainted);
7220 #endif
7221         if ( (rx->extflags & RXf_CANY_SEEN)
7222             ? (RXp_MATCH_UTF8(rx)
7223                         && (!i || is_utf8_string((U8*)s, i)))
7224             : (RXp_MATCH_UTF8(rx)) )
7225         {
7226             SvUTF8_on(sv);
7227         }
7228         else
7229             SvUTF8_off(sv);
7230         if (TAINTING_get) {
7231             if (RXp_MATCH_TAINTED(rx)) {
7232                 if (SvTYPE(sv) >= SVt_PVMG) {
7233                     MAGIC* const mg = SvMAGIC(sv);
7234                     MAGIC* mgt;
7235                     TAINT;
7236                     SvMAGIC_set(sv, mg->mg_moremagic);
7237                     SvTAINT(sv);
7238                     if ((mgt = SvMAGIC(sv))) {
7239                         mg->mg_moremagic = mgt;
7240                         SvMAGIC_set(sv, mg);
7241                     }
7242                 } else {
7243                     TAINT;
7244                     SvTAINT(sv);
7245                 }
7246             } else
7247                 SvTAINTED_off(sv);
7248         }
7249     } else {
7250       ret_undef:
7251         sv_setsv(sv,&PL_sv_undef);
7252         return;
7253     }
7254 }
7255
7256 void
7257 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7258                                                          SV const * const value)
7259 {
7260     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7261
7262     PERL_UNUSED_ARG(rx);
7263     PERL_UNUSED_ARG(paren);
7264     PERL_UNUSED_ARG(value);
7265
7266     if (!PL_localizing)
7267         Perl_croak_no_modify();
7268 }
7269
7270 I32
7271 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7272                               const I32 paren)
7273 {
7274     struct regexp *const rx = ReANY(r);
7275     I32 i;
7276     I32 s1, t1;
7277
7278     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7279
7280     if (   paren == RX_BUFF_IDX_CARET_PREMATCH
7281         || paren == RX_BUFF_IDX_CARET_FULLMATCH
7282         || paren == RX_BUFF_IDX_CARET_POSTMATCH
7283     )
7284     {
7285         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7286         if (!keepcopy) {
7287             /* on something like
7288              *    $r = qr/.../;
7289              *    /$qr/p;
7290              * the KEEPCOPY is set on the PMOP rather than the regex */
7291             if (PL_curpm && r == PM_GETRE(PL_curpm))
7292                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7293         }
7294         if (!keepcopy)
7295             goto warn_undef;
7296     }
7297
7298     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7299     switch (paren) {
7300       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7301       case RX_BUFF_IDX_PREMATCH:       /* $` */
7302         if (rx->offs[0].start != -1) {
7303                         i = rx->offs[0].start;
7304                         if (i > 0) {
7305                                 s1 = 0;
7306                                 t1 = i;
7307                                 goto getlen;
7308                         }
7309             }
7310         return 0;
7311
7312       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7313       case RX_BUFF_IDX_POSTMATCH:       /* $' */
7314             if (rx->offs[0].end != -1) {
7315                         i = rx->sublen - rx->offs[0].end;
7316                         if (i > 0) {
7317                                 s1 = rx->offs[0].end;
7318                                 t1 = rx->sublen;
7319                                 goto getlen;
7320                         }
7321             }
7322         return 0;
7323
7324       default: /* $& / ${^MATCH}, $1, $2, ... */
7325             if (paren <= (I32)rx->nparens &&
7326             (s1 = rx->offs[paren].start) != -1 &&
7327             (t1 = rx->offs[paren].end) != -1)
7328             {
7329             i = t1 - s1;
7330             goto getlen;
7331         } else {
7332           warn_undef:
7333             if (ckWARN(WARN_UNINITIALIZED))
7334                 report_uninit((const SV *)sv);
7335             return 0;
7336         }
7337     }
7338   getlen:
7339     if (i > 0 && RXp_MATCH_UTF8(rx)) {
7340         const char * const s = rx->subbeg - rx->suboffset + s1;
7341         const U8 *ep;
7342         STRLEN el;
7343
7344         i = t1 - s1;
7345         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7346                         i = el;
7347     }
7348     return i;
7349 }
7350
7351 SV*
7352 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7353 {
7354     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7355         PERL_UNUSED_ARG(rx);
7356         if (0)
7357             return NULL;
7358         else
7359             return newSVpvs("Regexp");
7360 }
7361
7362 /* Scans the name of a named buffer from the pattern.
7363  * If flags is REG_RSN_RETURN_NULL returns null.
7364  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7365  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7366  * to the parsed name as looked up in the RExC_paren_names hash.
7367  * If there is an error throws a vFAIL().. type exception.
7368  */
7369
7370 #define REG_RSN_RETURN_NULL    0
7371 #define REG_RSN_RETURN_NAME    1
7372 #define REG_RSN_RETURN_DATA    2
7373
7374 STATIC SV*
7375 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7376 {
7377     char *name_start = RExC_parse;
7378
7379     PERL_ARGS_ASSERT_REG_SCAN_NAME;
7380
7381     assert (RExC_parse <= RExC_end);
7382     if (RExC_parse == RExC_end) NOOP;
7383     else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7384          /* skip IDFIRST by using do...while */
7385         if (UTF)
7386             do {
7387                 RExC_parse += UTF8SKIP(RExC_parse);
7388             } while (isWORDCHAR_utf8((U8*)RExC_parse));
7389         else
7390             do {
7391                 RExC_parse++;
7392             } while (isWORDCHAR(*RExC_parse));
7393     } else {
7394         RExC_parse++; /* so the <- from the vFAIL is after the offending character */
7395         vFAIL("Group name must start with a non-digit word character");
7396     }
7397     if ( flags ) {
7398         SV* sv_name
7399             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7400                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7401         if ( flags == REG_RSN_RETURN_NAME)
7402             return sv_name;
7403         else if (flags==REG_RSN_RETURN_DATA) {
7404             HE *he_str = NULL;
7405             SV *sv_dat = NULL;
7406             if ( ! sv_name )      /* should not happen*/
7407                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7408             if (RExC_paren_names)
7409                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7410             if ( he_str )
7411                 sv_dat = HeVAL(he_str);
7412             if ( ! sv_dat )
7413                 vFAIL("Reference to nonexistent named group");
7414             return sv_dat;
7415         }
7416         else {
7417             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7418                        (unsigned long) flags);
7419         }
7420         assert(0); /* NOT REACHED */
7421     }
7422     return NULL;
7423 }
7424
7425 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
7426     int rem=(int)(RExC_end - RExC_parse);                       \
7427     int cut;                                                    \
7428     int num;                                                    \
7429     int iscut=0;                                                \
7430     if (rem>10) {                                               \
7431         rem=10;                                                 \
7432         iscut=1;                                                \
7433     }                                                           \
7434     cut=10-rem;                                                 \
7435     if (RExC_lastparse!=RExC_parse)                             \
7436         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
7437             rem, RExC_parse,                                    \
7438             cut + 4,                                            \
7439             iscut ? "..." : "<"                                 \
7440         );                                                      \
7441     else                                                        \
7442         PerlIO_printf(Perl_debug_log,"%16s","");                \
7443                                                                 \
7444     if (SIZE_ONLY)                                              \
7445        num = RExC_size + 1;                                     \
7446     else                                                        \
7447        num=REG_NODE_NUM(RExC_emit);                             \
7448     if (RExC_lastnum!=num)                                      \
7449        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
7450     else                                                        \
7451        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
7452     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
7453         (int)((depth*2)), "",                                   \
7454         (funcname)                                              \
7455     );                                                          \
7456     RExC_lastnum=num;                                           \
7457     RExC_lastparse=RExC_parse;                                  \
7458 })
7459
7460
7461
7462 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
7463     DEBUG_PARSE_MSG((funcname));                            \
7464     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
7465 })
7466 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
7467     DEBUG_PARSE_MSG((funcname));                            \
7468     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
7469 })
7470
7471 /* This section of code defines the inversion list object and its methods.  The
7472  * interfaces are highly subject to change, so as much as possible is static to
7473  * this file.  An inversion list is here implemented as a malloc'd C UV array
7474  * as an SVt_INVLIST scalar.
7475  *
7476  * An inversion list for Unicode is an array of code points, sorted by ordinal
7477  * number.  The zeroth element is the first code point in the list.  The 1th
7478  * element is the first element beyond that not in the list.  In other words,
7479  * the first range is
7480  *  invlist[0]..(invlist[1]-1)
7481  * The other ranges follow.  Thus every element whose index is divisible by two
7482  * marks the beginning of a range that is in the list, and every element not
7483  * divisible by two marks the beginning of a range not in the list.  A single
7484  * element inversion list that contains the single code point N generally
7485  * consists of two elements
7486  *  invlist[0] == N
7487  *  invlist[1] == N+1
7488  * (The exception is when N is the highest representable value on the
7489  * machine, in which case the list containing just it would be a single
7490  * element, itself.  By extension, if the last range in the list extends to
7491  * infinity, then the first element of that range will be in the inversion list
7492  * at a position that is divisible by two, and is the final element in the
7493  * list.)
7494  * Taking the complement (inverting) an inversion list is quite simple, if the
7495  * first element is 0, remove it; otherwise add a 0 element at the beginning.
7496  * This implementation reserves an element at the beginning of each inversion
7497  * list to always contain 0; there is an additional flag in the header which
7498  * indicates if the list begins at the 0, or is offset to begin at the next
7499  * element.
7500  *
7501  * More about inversion lists can be found in "Unicode Demystified"
7502  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7503  * More will be coming when functionality is added later.
7504  *
7505  * The inversion list data structure is currently implemented as an SV pointing
7506  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
7507  * array of UV whose memory management is automatically handled by the existing
7508  * facilities for SV's.
7509  *
7510  * Some of the methods should always be private to the implementation, and some
7511  * should eventually be made public */
7512
7513 /* The header definitions are in F<inline_invlist.c> */
7514
7515 PERL_STATIC_INLINE UV*
7516 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7517 {
7518     /* Returns a pointer to the first element in the inversion list's array.
7519      * This is called upon initialization of an inversion list.  Where the
7520      * array begins depends on whether the list has the code point U+0000 in it
7521      * or not.  The other parameter tells it whether the code that follows this
7522      * call is about to put a 0 in the inversion list or not.  The first
7523      * element is either the element reserved for 0, if TRUE, or the element
7524      * after it, if FALSE */
7525
7526     bool* offset = get_invlist_offset_addr(invlist);
7527     UV* zero_addr = (UV *) SvPVX(invlist);
7528
7529     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7530
7531     /* Must be empty */
7532     assert(! _invlist_len(invlist));
7533
7534     *zero_addr = 0;
7535
7536     /* 1^1 = 0; 1^0 = 1 */
7537     *offset = 1 ^ will_have_0;
7538     return zero_addr + *offset;
7539 }
7540
7541 PERL_STATIC_INLINE UV*
7542 S_invlist_array(pTHX_ SV* const invlist)
7543 {
7544     /* Returns the pointer to the inversion list's array.  Every time the
7545      * length changes, this needs to be called in case malloc or realloc moved
7546      * it */
7547
7548     PERL_ARGS_ASSERT_INVLIST_ARRAY;
7549
7550     /* Must not be empty.  If these fail, you probably didn't check for <len>
7551      * being non-zero before trying to get the array */
7552     assert(_invlist_len(invlist));
7553
7554     /* The very first element always contains zero, The array begins either
7555      * there, or if the inversion list is offset, at the element after it.
7556      * The offset header field determines which; it contains 0 or 1 to indicate
7557      * how much additionally to add */
7558     assert(0 == *(SvPVX(invlist)));
7559     return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
7560 }
7561
7562 PERL_STATIC_INLINE void
7563 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
7564 {
7565     /* Sets the current number of elements stored in the inversion list.
7566      * Updates SvCUR correspondingly */
7567
7568     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7569
7570     assert(SvTYPE(invlist) == SVt_INVLIST);
7571
7572     SvCUR_set(invlist,
7573               (len == 0)
7574                ? 0
7575                : TO_INTERNAL_SIZE(len + offset));
7576     assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
7577 }
7578
7579 PERL_STATIC_INLINE IV*
7580 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7581 {
7582     /* Return the address of the IV that is reserved to hold the cached index
7583      * */
7584
7585     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7586
7587     assert(SvTYPE(invlist) == SVt_INVLIST);
7588
7589     return &(((XINVLIST*) SvANY(invlist))->prev_index);
7590 }
7591
7592 PERL_STATIC_INLINE IV
7593 S_invlist_previous_index(pTHX_ SV* const invlist)
7594 {
7595     /* Returns cached index of previous search */
7596
7597     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7598
7599     return *get_invlist_previous_index_addr(invlist);
7600 }
7601
7602 PERL_STATIC_INLINE void
7603 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7604 {
7605     /* Caches <index> for later retrieval */
7606
7607     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7608
7609     assert(index == 0 || index < (int) _invlist_len(invlist));
7610
7611     *get_invlist_previous_index_addr(invlist) = index;
7612 }
7613
7614 PERL_STATIC_INLINE UV
7615 S_invlist_max(pTHX_ SV* const invlist)
7616 {
7617     /* Returns the maximum number of elements storable in the inversion list's
7618      * array, without having to realloc() */
7619
7620     PERL_ARGS_ASSERT_INVLIST_MAX;
7621
7622     assert(SvTYPE(invlist) == SVt_INVLIST);
7623
7624     /* Assumes worst case, in which the 0 element is not counted in the
7625      * inversion list, so subtracts 1 for that */
7626     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
7627            ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
7628            : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
7629 }
7630
7631 #ifndef PERL_IN_XSUB_RE
7632 SV*
7633 Perl__new_invlist(pTHX_ IV initial_size)
7634 {
7635
7636     /* Return a pointer to a newly constructed inversion list, with enough
7637      * space to store 'initial_size' elements.  If that number is negative, a
7638      * system default is used instead */
7639
7640     SV* new_list;
7641
7642     if (initial_size < 0) {
7643         initial_size = 10;
7644     }
7645
7646     /* Allocate the initial space */
7647     new_list = newSV_type(SVt_INVLIST);
7648
7649     /* First 1 is in case the zero element isn't in the list; second 1 is for
7650      * trailing NUL */
7651     SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
7652     invlist_set_len(new_list, 0, 0);
7653
7654     /* Force iterinit() to be used to get iteration to work */
7655     *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
7656
7657     *get_invlist_previous_index_addr(new_list) = 0;
7658
7659     return new_list;
7660 }
7661 #endif
7662
7663 STATIC SV*
7664 S__new_invlist_C_array(pTHX_ const UV* const list)
7665 {
7666     /* Return a pointer to a newly constructed inversion list, initialized to
7667      * point to <list>, which has to be in the exact correct inversion list
7668      * form, including internal fields.  Thus this is a dangerous routine that
7669      * should not be used in the wrong hands.  The passed in 'list' contains
7670      * several header fields at the beginning that are not part of the
7671      * inversion list body proper */
7672
7673     const STRLEN length = (STRLEN) list[0];
7674     const UV version_id =          list[1];
7675     const bool offset   =    cBOOL(list[2]);
7676 #define HEADER_LENGTH 3
7677     /* If any of the above changes in any way, you must change HEADER_LENGTH
7678      * (if appropriate) and regenerate INVLIST_VERSION_ID by running
7679      *      perl -E 'say int(rand 2**31-1)'
7680      */
7681 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
7682                                         data structure type, so that one being
7683                                         passed in can be validated to be an
7684                                         inversion list of the correct vintage.
7685                                        */
7686
7687     SV* invlist = newSV_type(SVt_INVLIST);
7688
7689     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7690
7691     if (version_id != INVLIST_VERSION_ID) {
7692         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7693     }
7694
7695     /* The generated array passed in includes header elements that aren't part
7696      * of the list proper, so start it just after them */
7697     SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
7698
7699     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
7700                                shouldn't touch it */
7701
7702     *(get_invlist_offset_addr(invlist)) = offset;
7703
7704     /* The 'length' passed to us is the physical number of elements in the
7705      * inversion list.  But if there is an offset the logical number is one
7706      * less than that */
7707     invlist_set_len(invlist, length  - offset, offset);
7708
7709     invlist_set_previous_index(invlist, 0);
7710
7711     /* Initialize the iteration pointer. */
7712     invlist_iterfinish(invlist);
7713
7714     return invlist;
7715 }
7716
7717 STATIC void
7718 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7719 {
7720     /* Grow the maximum size of an inversion list */
7721
7722     PERL_ARGS_ASSERT_INVLIST_EXTEND;
7723
7724     assert(SvTYPE(invlist) == SVt_INVLIST);
7725
7726     /* Add one to account for the zero element at the beginning which may not
7727      * be counted by the calling parameters */
7728     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
7729 }
7730
7731 PERL_STATIC_INLINE void
7732 S_invlist_trim(pTHX_ SV* const invlist)
7733 {
7734     PERL_ARGS_ASSERT_INVLIST_TRIM;
7735
7736     assert(SvTYPE(invlist) == SVt_INVLIST);
7737
7738     /* Change the length of the inversion list to how many entries it currently
7739      * has */
7740     SvPV_shrink_to_cur((SV *) invlist);
7741 }
7742
7743 STATIC void
7744 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7745 {
7746    /* Subject to change or removal.  Append the range from 'start' to 'end' at
7747     * the end of the inversion list.  The range must be above any existing
7748     * ones. */
7749
7750     UV* array;
7751     UV max = invlist_max(invlist);
7752     UV len = _invlist_len(invlist);
7753     bool offset;
7754
7755     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7756
7757     if (len == 0) { /* Empty lists must be initialized */
7758         offset = start != 0;
7759         array = _invlist_array_init(invlist, ! offset);
7760     }
7761     else {
7762         /* Here, the existing list is non-empty. The current max entry in the
7763          * list is generally the first value not in the set, except when the
7764          * set extends to the end of permissible values, in which case it is
7765          * the first entry in that final set, and so this call is an attempt to
7766          * append out-of-order */
7767
7768         UV final_element = len - 1;
7769         array = invlist_array(invlist);
7770         if (array[final_element] > start
7771             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7772         {
7773             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7774                        array[final_element], start,
7775                        ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7776         }
7777
7778         /* Here, it is a legal append.  If the new range begins with the first
7779          * value not in the set, it is extending the set, so the new first
7780          * value not in the set is one greater than the newly extended range.
7781          * */
7782         offset = *get_invlist_offset_addr(invlist);
7783         if (array[final_element] == start) {
7784             if (end != UV_MAX) {
7785                 array[final_element] = end + 1;
7786             }
7787             else {
7788                 /* But if the end is the maximum representable on the machine,
7789                  * just let the range that this would extend to have no end */
7790                 invlist_set_len(invlist, len - 1, offset);
7791             }
7792             return;
7793         }
7794     }
7795
7796     /* Here the new range doesn't extend any existing set.  Add it */
7797
7798     len += 2;   /* Includes an element each for the start and end of range */
7799
7800     /* If wll overflow the existing space, extend, which may cause the array to
7801      * be moved */
7802     if (max < len) {
7803         invlist_extend(invlist, len);
7804
7805         /* Have to set len here to avoid assert failure in invlist_array() */
7806         invlist_set_len(invlist, len, offset);
7807
7808         array = invlist_array(invlist);
7809     }
7810     else {
7811         invlist_set_len(invlist, len, offset);
7812     }
7813
7814     /* The next item on the list starts the range, the one after that is
7815      * one past the new range.  */
7816     array[len - 2] = start;
7817     if (end != UV_MAX) {
7818         array[len - 1] = end + 1;
7819     }
7820     else {
7821         /* But if the end is the maximum representable on the machine, just let
7822          * the range have no end */
7823         invlist_set_len(invlist, len - 1, offset);
7824     }
7825 }
7826
7827 #ifndef PERL_IN_XSUB_RE
7828
7829 IV
7830 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7831 {
7832     /* Searches the inversion list for the entry that contains the input code
7833      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
7834      * return value is the index into the list's array of the range that
7835      * contains <cp> */
7836
7837     IV low = 0;
7838     IV mid;
7839     IV high = _invlist_len(invlist);
7840     const IV highest_element = high - 1;
7841     const UV* array;
7842
7843     PERL_ARGS_ASSERT__INVLIST_SEARCH;
7844
7845     /* If list is empty, return failure. */
7846     if (high == 0) {
7847         return -1;
7848     }
7849
7850     /* (We can't get the array unless we know the list is non-empty) */
7851     array = invlist_array(invlist);
7852
7853     mid = invlist_previous_index(invlist);
7854     assert(mid >=0 && mid <= highest_element);
7855
7856     /* <mid> contains the cache of the result of the previous call to this
7857      * function (0 the first time).  See if this call is for the same result,
7858      * or if it is for mid-1.  This is under the theory that calls to this
7859      * function will often be for related code points that are near each other.
7860      * And benchmarks show that caching gives better results.  We also test
7861      * here if the code point is within the bounds of the list.  These tests
7862      * replace others that would have had to be made anyway to make sure that
7863      * the array bounds were not exceeded, and these give us extra information
7864      * at the same time */
7865     if (cp >= array[mid]) {
7866         if (cp >= array[highest_element]) {
7867             return highest_element;
7868         }
7869
7870         /* Here, array[mid] <= cp < array[highest_element].  This means that
7871          * the final element is not the answer, so can exclude it; it also
7872          * means that <mid> is not the final element, so can refer to 'mid + 1'
7873          * safely */
7874         if (cp < array[mid + 1]) {
7875             return mid;
7876         }
7877         high--;
7878         low = mid + 1;
7879     }
7880     else { /* cp < aray[mid] */
7881         if (cp < array[0]) { /* Fail if outside the array */
7882             return -1;
7883         }
7884         high = mid;
7885         if (cp >= array[mid - 1]) {
7886             goto found_entry;
7887         }
7888     }
7889
7890     /* Binary search.  What we are looking for is <i> such that
7891      *  array[i] <= cp < array[i+1]
7892      * The loop below converges on the i+1.  Note that there may not be an
7893      * (i+1)th element in the array, and things work nonetheless */
7894     while (low < high) {
7895         mid = (low + high) / 2;
7896         assert(mid <= highest_element);
7897         if (array[mid] <= cp) { /* cp >= array[mid] */
7898             low = mid + 1;
7899
7900             /* We could do this extra test to exit the loop early.
7901             if (cp < array[low]) {
7902                 return mid;
7903             }
7904             */
7905         }
7906         else { /* cp < array[mid] */
7907             high = mid;
7908         }
7909     }
7910
7911   found_entry:
7912     high--;
7913     invlist_set_previous_index(invlist, high);
7914     return high;
7915 }
7916
7917 void
7918 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7919 {
7920     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7921      * but is used when the swash has an inversion list.  This makes this much
7922      * faster, as it uses a binary search instead of a linear one.  This is
7923      * intimately tied to that function, and perhaps should be in utf8.c,
7924      * except it is intimately tied to inversion lists as well.  It assumes
7925      * that <swatch> is all 0's on input */
7926
7927     UV current = start;
7928     const IV len = _invlist_len(invlist);
7929     IV i;
7930     const UV * array;
7931
7932     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7933
7934     if (len == 0) { /* Empty inversion list */
7935         return;
7936     }
7937
7938     array = invlist_array(invlist);
7939
7940     /* Find which element it is */
7941     i = _invlist_search(invlist, start);
7942
7943     /* We populate from <start> to <end> */
7944     while (current < end) {
7945         UV upper;
7946
7947         /* The inversion list gives the results for every possible code point
7948          * after the first one in the list.  Only those ranges whose index is
7949          * even are ones that the inversion list matches.  For the odd ones,
7950          * and if the initial code point is not in the list, we have to skip
7951          * forward to the next element */
7952         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7953             i++;
7954             if (i >= len) { /* Finished if beyond the end of the array */
7955                 return;
7956             }
7957             current = array[i];
7958             if (current >= end) {   /* Finished if beyond the end of what we
7959                                        are populating */
7960                 if (LIKELY(end < UV_MAX)) {
7961                     return;
7962                 }
7963
7964                 /* We get here when the upper bound is the maximum
7965                  * representable on the machine, and we are looking for just
7966                  * that code point.  Have to special case it */
7967                 i = len;
7968                 goto join_end_of_list;
7969             }
7970         }
7971         assert(current >= start);
7972
7973         /* The current range ends one below the next one, except don't go past
7974          * <end> */
7975         i++;
7976         upper = (i < len && array[i] < end) ? array[i] : end;
7977
7978         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
7979          * for each code point in it */
7980         for (; current < upper; current++) {
7981             const STRLEN offset = (STRLEN)(current - start);
7982             swatch[offset >> 3] |= 1 << (offset & 7);
7983         }
7984
7985     join_end_of_list:
7986
7987         /* Quit if at the end of the list */
7988         if (i >= len) {
7989
7990             /* But first, have to deal with the highest possible code point on
7991              * the platform.  The previous code assumes that <end> is one
7992              * beyond where we want to populate, but that is impossible at the
7993              * platform's infinity, so have to handle it specially */
7994             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7995             {
7996                 const STRLEN offset = (STRLEN)(end - start);
7997                 swatch[offset >> 3] |= 1 << (offset & 7);
7998             }
7999             return;
8000         }
8001
8002         /* Advance to the next range, which will be for code points not in the
8003          * inversion list */
8004         current = array[i];
8005     }
8006
8007     return;
8008 }
8009
8010 void
8011 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** output)
8012 {
8013     /* Take the union of two inversion lists and point <output> to it.  *output
8014      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8015      * the reference count to that list will be decremented if not already a
8016      * temporary (mortal); otherwise *output will be made correspondingly
8017      * mortal.  The first list, <a>, may be NULL, in which case a copy of the
8018      * second list is returned.  If <complement_b> is TRUE, the union is taken
8019      * of the complement (inversion) of <b> instead of b itself.
8020      *
8021      * The basis for this comes from "Unicode Demystified" Chapter 13 by
8022      * Richard Gillam, published by Addison-Wesley, and explained at some
8023      * length there.  The preface says to incorporate its examples into your
8024      * code at your own risk.
8025      *
8026      * The algorithm is like a merge sort.
8027      *
8028      * XXX A potential performance improvement is to keep track as we go along
8029      * if only one of the inputs contributes to the result, meaning the other
8030      * is a subset of that one.  In that case, we can skip the final copy and
8031      * return the larger of the input lists, but then outside code might need
8032      * to keep track of whether to free the input list or not */
8033
8034     const UV* array_a;    /* a's array */
8035     const UV* array_b;
8036     UV len_a;       /* length of a's array */
8037     UV len_b;
8038
8039     SV* u;                      /* the resulting union */
8040     UV* array_u;
8041     UV len_u;
8042
8043     UV i_a = 0;             /* current index into a's array */
8044     UV i_b = 0;
8045     UV i_u = 0;
8046
8047     /* running count, as explained in the algorithm source book; items are
8048      * stopped accumulating and are output when the count changes to/from 0.
8049      * The count is incremented when we start a range that's in the set, and
8050      * decremented when we start a range that's not in the set.  So its range
8051      * is 0 to 2.  Only when the count is zero is something not in the set.
8052      */
8053     UV count = 0;
8054
8055     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8056     assert(a != b);
8057
8058     /* If either one is empty, the union is the other one */
8059     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8060         bool make_temp = FALSE; /* Should we mortalize the result? */
8061
8062         if (*output == a) {
8063             if (a != NULL) {
8064                 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8065                     SvREFCNT_dec_NN(a);
8066                 }
8067             }
8068         }
8069         if (*output != b) {
8070             *output = invlist_clone(b);
8071             if (complement_b) {
8072                 _invlist_invert(*output);
8073             }
8074         } /* else *output already = b; */
8075
8076         if (make_temp) {
8077             sv_2mortal(*output);
8078         }
8079         return;
8080     }
8081     else if ((len_b = _invlist_len(b)) == 0) {
8082         bool make_temp = FALSE;
8083         if (*output == b) {
8084             if (! (make_temp = cBOOL(SvTEMP(b)))) {
8085                 SvREFCNT_dec_NN(b);
8086             }
8087         }
8088
8089         /* The complement of an empty list is a list that has everything in it,
8090          * so the union with <a> includes everything too */
8091         if (complement_b) {
8092             if (a == *output) {
8093                 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8094                     SvREFCNT_dec_NN(a);
8095                 }
8096             }
8097             *output = _new_invlist(1);
8098             _append_range_to_invlist(*output, 0, UV_MAX);
8099         }
8100         else if (*output != a) {
8101             *output = invlist_clone(a);
8102         }
8103         /* else *output already = a; */
8104
8105         if (make_temp) {
8106             sv_2mortal(*output);
8107         }
8108         return;
8109     }
8110
8111     /* Here both lists exist and are non-empty */
8112     array_a = invlist_array(a);
8113     array_b = invlist_array(b);
8114
8115     /* If are to take the union of 'a' with the complement of b, set it
8116      * up so are looking at b's complement. */
8117     if (complement_b) {
8118
8119         /* To complement, we invert: if the first element is 0, remove it.  To
8120          * do this, we just pretend the array starts one later */
8121         if (array_b[0] == 0) {
8122             array_b++;
8123             len_b--;
8124         }
8125         else {
8126
8127             /* But if the first element is not zero, we pretend the list starts
8128              * at the 0 that is always stored immediately before the array. */
8129             array_b--;
8130             len_b++;
8131         }
8132     }
8133
8134     /* Size the union for the worst case: that the sets are completely
8135      * disjoint */
8136     u = _new_invlist(len_a + len_b);
8137
8138     /* Will contain U+0000 if either component does */
8139     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8140                                       || (len_b > 0 && array_b[0] == 0));
8141
8142     /* Go through each list item by item, stopping when exhausted one of
8143      * them */
8144     while (i_a < len_a && i_b < len_b) {
8145         UV cp;      /* The element to potentially add to the union's array */
8146         bool cp_in_set;   /* is it in the the input list's set or not */
8147
8148         /* We need to take one or the other of the two inputs for the union.
8149          * Since we are merging two sorted lists, we take the smaller of the
8150          * next items.  In case of a tie, we take the one that is in its set
8151          * first.  If we took one not in the set first, it would decrement the
8152          * count, possibly to 0 which would cause it to be output as ending the
8153          * range, and the next time through we would take the same number, and
8154          * output it again as beginning the next range.  By doing it the
8155          * opposite way, there is no possibility that the count will be
8156          * momentarily decremented to 0, and thus the two adjoining ranges will
8157          * be seamlessly merged.  (In a tie and both are in the set or both not
8158          * in the set, it doesn't matter which we take first.) */
8159         if (array_a[i_a] < array_b[i_b]
8160             || (array_a[i_a] == array_b[i_b]
8161                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8162         {
8163             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8164             cp= array_a[i_a++];
8165         }
8166         else {
8167             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8168             cp = array_b[i_b++];
8169         }
8170
8171         /* Here, have chosen which of the two inputs to look at.  Only output
8172          * if the running count changes to/from 0, which marks the
8173          * beginning/end of a range in that's in the set */
8174         if (cp_in_set) {
8175             if (count == 0) {
8176                 array_u[i_u++] = cp;
8177             }
8178             count++;
8179         }
8180         else {
8181             count--;
8182             if (count == 0) {
8183                 array_u[i_u++] = cp;
8184             }
8185         }
8186     }
8187
8188     /* Here, we are finished going through at least one of the lists, which
8189      * means there is something remaining in at most one.  We check if the list
8190      * that hasn't been exhausted is positioned such that we are in the middle
8191      * of a range in its set or not.  (i_a and i_b point to the element beyond
8192      * the one we care about.) If in the set, we decrement 'count'; if 0, there
8193      * is potentially more to output.
8194      * There are four cases:
8195      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
8196      *     in the union is entirely from the non-exhausted set.
8197      *  2) Both were in their sets, count is 2.  Nothing further should
8198      *     be output, as everything that remains will be in the exhausted
8199      *     list's set, hence in the union; decrementing to 1 but not 0 insures
8200      *     that
8201      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
8202      *     Nothing further should be output because the union includes
8203      *     everything from the exhausted set.  Not decrementing ensures that.
8204      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8205      *     decrementing to 0 insures that we look at the remainder of the
8206      *     non-exhausted set */
8207     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8208         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8209     {
8210         count--;
8211     }
8212
8213     /* The final length is what we've output so far, plus what else is about to
8214      * be output.  (If 'count' is non-zero, then the input list we exhausted
8215      * has everything remaining up to the machine's limit in its set, and hence
8216      * in the union, so there will be no further output. */
8217     len_u = i_u;
8218     if (count == 0) {
8219         /* At most one of the subexpressions will be non-zero */
8220         len_u += (len_a - i_a) + (len_b - i_b);
8221     }
8222
8223     /* Set result to final length, which can change the pointer to array_u, so
8224      * re-find it */
8225     if (len_u != _invlist_len(u)) {
8226         invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8227         invlist_trim(u);
8228         array_u = invlist_array(u);
8229     }
8230
8231     /* When 'count' is 0, the list that was exhausted (if one was shorter than
8232      * the other) ended with everything above it not in its set.  That means
8233      * that the remaining part of the union is precisely the same as the
8234      * non-exhausted list, so can just copy it unchanged.  (If both list were
8235      * exhausted at the same time, then the operations below will be both 0.)
8236      */
8237     if (count == 0) {
8238         IV copy_count; /* At most one will have a non-zero copy count */
8239         if ((copy_count = len_a - i_a) > 0) {
8240             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8241         }
8242         else if ((copy_count = len_b - i_b) > 0) {
8243             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8244         }
8245     }
8246
8247     /*  We may be removing a reference to one of the inputs.  If so, the output
8248      *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8249      *  count decremented) */
8250     if (a == *output || b == *output) {
8251         assert(! invlist_is_iterating(*output));
8252         if ((SvTEMP(*output))) {
8253             sv_2mortal(u);
8254         }
8255         else {
8256             SvREFCNT_dec_NN(*output);
8257         }
8258     }
8259
8260     *output = u;
8261
8262     return;
8263 }
8264
8265 void
8266 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** i)
8267 {
8268     /* Take the intersection of two inversion lists and point <i> to it.  *i
8269      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8270      * the reference count to that list will be decremented if not already a
8271      * temporary (mortal); otherwise *i will be made correspondingly mortal.
8272      * The first list, <a>, may be NULL, in which case an empty list is
8273      * returned.  If <complement_b> is TRUE, the result will be the
8274      * intersection of <a> and the complement (or inversion) of <b> instead of
8275      * <b> directly.
8276      *
8277      * The basis for this comes from "Unicode Demystified" Chapter 13 by
8278      * Richard Gillam, published by Addison-Wesley, and explained at some
8279      * length there.  The preface says to incorporate its examples into your
8280      * code at your own risk.  In fact, it had bugs
8281      *
8282      * The algorithm is like a merge sort, and is essentially the same as the
8283      * union above
8284      */
8285
8286     const UV* array_a;          /* a's array */
8287     const UV* array_b;
8288     UV len_a;   /* length of a's array */
8289     UV len_b;
8290
8291     SV* r;                   /* the resulting intersection */
8292     UV* array_r;
8293     UV len_r;
8294
8295     UV i_a = 0;             /* current index into a's array */
8296     UV i_b = 0;
8297     UV i_r = 0;
8298
8299     /* running count, as explained in the algorithm source book; items are
8300      * stopped accumulating and are output when the count changes to/from 2.
8301      * The count is incremented when we start a range that's in the set, and
8302      * decremented when we start a range that's not in the set.  So its range
8303      * is 0 to 2.  Only when the count is 2 is something in the intersection.
8304      */
8305     UV count = 0;
8306
8307     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8308     assert(a != b);
8309
8310     /* Special case if either one is empty */
8311     len_a = (a == NULL) ? 0 : _invlist_len(a);
8312     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8313         bool make_temp = FALSE;
8314
8315         if (len_a != 0 && complement_b) {
8316
8317             /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8318              * be empty.  Here, also we are using 'b's complement, which hence
8319              * must be every possible code point.  Thus the intersection is
8320              * simply 'a'. */
8321             if (*i != a) {
8322                 if (*i == b) {
8323                     if (! (make_temp = cBOOL(SvTEMP(b)))) {
8324                         SvREFCNT_dec_NN(b);
8325                     }
8326                 }
8327
8328                 *i = invlist_clone(a);
8329             }
8330             /* else *i is already 'a' */
8331
8332             if (make_temp) {
8333                 sv_2mortal(*i);
8334             }
8335             return;
8336         }
8337
8338         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
8339          * intersection must be empty */
8340         if (*i == a) {
8341             if (! (make_temp = cBOOL(SvTEMP(a)))) {
8342                 SvREFCNT_dec_NN(a);
8343             }
8344         }
8345         else if (*i == b) {
8346             if (! (make_temp = cBOOL(SvTEMP(b)))) {
8347                 SvREFCNT_dec_NN(b);
8348             }
8349         }
8350         *i = _new_invlist(0);
8351         if (make_temp) {
8352             sv_2mortal(*i);
8353         }
8354
8355         return;
8356     }
8357
8358     /* Here both lists exist and are non-empty */
8359     array_a = invlist_array(a);
8360     array_b = invlist_array(b);
8361
8362     /* If are to take the intersection of 'a' with the complement of b, set it
8363      * up so are looking at b's complement. */
8364     if (complement_b) {
8365
8366         /* To complement, we invert: if the first element is 0, remove it.  To
8367          * do this, we just pretend the array starts one later */
8368         if (array_b[0] == 0) {
8369             array_b++;
8370             len_b--;
8371         }
8372         else {
8373
8374             /* But if the first element is not zero, we pretend the list starts
8375              * at the 0 that is always stored immediately before the array. */
8376             array_b--;
8377             len_b++;
8378         }
8379     }
8380
8381     /* Size the intersection for the worst case: that the intersection ends up
8382      * fragmenting everything to be completely disjoint */
8383     r= _new_invlist(len_a + len_b);
8384
8385     /* Will contain U+0000 iff both components do */
8386     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8387                                      && len_b > 0 && array_b[0] == 0);
8388
8389     /* Go through each list item by item, stopping when exhausted one of
8390      * them */
8391     while (i_a < len_a && i_b < len_b) {
8392         UV cp;      /* The element to potentially add to the intersection's
8393                        array */
8394         bool cp_in_set; /* Is it in the input list's set or not */
8395
8396         /* We need to take one or the other of the two inputs for the
8397          * intersection.  Since we are merging two sorted lists, we take the
8398          * smaller of the next items.  In case of a tie, we take the one that
8399          * is not in its set first (a difference from the union algorithm).  If
8400          * we took one in the set first, it would increment the count, possibly
8401          * to 2 which would cause it to be output as starting a range in the
8402          * intersection, and the next time through we would take that same
8403          * number, and output it again as ending the set.  By doing it the
8404          * opposite of this, there is no possibility that the count will be
8405          * momentarily incremented to 2.  (In a tie and both are in the set or
8406          * both not in the set, it doesn't matter which we take first.) */
8407         if (array_a[i_a] < array_b[i_b]
8408             || (array_a[i_a] == array_b[i_b]
8409                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8410         {
8411             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8412             cp= array_a[i_a++];
8413         }
8414         else {
8415             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8416             cp= array_b[i_b++];
8417         }
8418
8419         /* Here, have chosen which of the two inputs to look at.  Only output
8420          * if the running count changes to/from 2, which marks the
8421          * beginning/end of a range that's in the intersection */
8422         if (cp_in_set) {
8423             count++;
8424             if (count == 2) {
8425                 array_r[i_r++] = cp;
8426             }
8427         }
8428         else {
8429             if (count == 2) {
8430                 array_r[i_r++] = cp;
8431             }
8432             count--;
8433         }
8434     }
8435
8436     /* Here, we are finished going through at least one of the lists, which
8437      * means there is something remaining in at most one.  We check if the list
8438      * that has been exhausted is positioned such that we are in the middle
8439      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
8440      * the ones we care about.)  There are four cases:
8441      *  1) Both weren't in their sets, count is 0, and remains 0.  There's
8442      *     nothing left in the intersection.
8443      *  2) Both were in their sets, count is 2 and perhaps is incremented to
8444      *     above 2.  What should be output is exactly that which is in the
8445      *     non-exhausted set, as everything it has is also in the intersection
8446      *     set, and everything it doesn't have can't be in the intersection
8447      *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8448      *     gets incremented to 2.  Like the previous case, the intersection is
8449      *     everything that remains in the non-exhausted set.
8450      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8451      *     remains 1.  And the intersection has nothing more. */
8452     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8453         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8454     {
8455         count++;
8456     }
8457
8458     /* The final length is what we've output so far plus what else is in the
8459      * intersection.  At most one of the subexpressions below will be non-zero */
8460     len_r = i_r;
8461     if (count >= 2) {
8462         len_r += (len_a - i_a) + (len_b - i_b);
8463     }
8464
8465     /* Set result to final length, which can change the pointer to array_r, so
8466      * re-find it */
8467     if (len_r != _invlist_len(r)) {
8468         invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8469         invlist_trim(r);
8470         array_r = invlist_array(r);
8471     }
8472
8473     /* Finish outputting any remaining */
8474     if (count >= 2) { /* At most one will have a non-zero copy count */
8475         IV copy_count;
8476         if ((copy_count = len_a - i_a) > 0) {
8477             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8478         }
8479         else if ((copy_count = len_b - i_b) > 0) {
8480             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8481         }
8482     }
8483
8484     /*  We may be removing a reference to one of the inputs.  If so, the output
8485      *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8486      *  count decremented) */
8487     if (a == *i || b == *i) {
8488         assert(! invlist_is_iterating(*i));
8489         if (SvTEMP(*i)) {
8490             sv_2mortal(r);
8491         }
8492         else {
8493             SvREFCNT_dec_NN(*i);
8494         }
8495     }
8496
8497     *i = r;
8498
8499     return;
8500 }
8501
8502 SV*
8503 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8504 {
8505     /* Add the range from 'start' to 'end' inclusive to the inversion list's
8506      * set.  A pointer to the inversion list is returned.  This may actually be
8507      * a new list, in which case the passed in one has been destroyed.  The
8508      * passed in inversion list can be NULL, in which case a new one is created
8509      * with just the one range in it */
8510
8511     SV* range_invlist;
8512     UV len;
8513
8514     if (invlist == NULL) {
8515         invlist = _new_invlist(2);
8516         len = 0;
8517     }
8518     else {
8519         len = _invlist_len(invlist);
8520     }
8521
8522     /* If comes after the final entry actually in the list, can just append it
8523      * to the end, */
8524     if (len == 0
8525         || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8526             && start >= invlist_array(invlist)[len - 1]))
8527     {
8528         _append_range_to_invlist(invlist, start, end);
8529         return invlist;
8530     }
8531
8532     /* Here, can't just append things, create and return a new inversion list
8533      * which is the union of this range and the existing inversion list */
8534     range_invlist = _new_invlist(2);
8535     _append_range_to_invlist(range_invlist, start, end);
8536
8537     _invlist_union(invlist, range_invlist, &invlist);
8538
8539     /* The temporary can be freed */
8540     SvREFCNT_dec_NN(range_invlist);
8541
8542     return invlist;
8543 }
8544
8545 #endif
8546
8547 PERL_STATIC_INLINE SV*
8548 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8549     return _add_range_to_invlist(invlist, cp, cp);
8550 }
8551
8552 #ifndef PERL_IN_XSUB_RE
8553 void
8554 Perl__invlist_invert(pTHX_ SV* const invlist)
8555 {
8556     /* Complement the input inversion list.  This adds a 0 if the list didn't
8557      * have a zero; removes it otherwise.  As described above, the data
8558      * structure is set up so that this is very efficient */
8559
8560     PERL_ARGS_ASSERT__INVLIST_INVERT;
8561
8562     assert(! invlist_is_iterating(invlist));
8563
8564     /* The inverse of matching nothing is matching everything */
8565     if (_invlist_len(invlist) == 0) {
8566         _append_range_to_invlist(invlist, 0, UV_MAX);
8567         return;
8568     }
8569
8570     *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
8571 }
8572
8573 void
8574 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
8575 {
8576     /* Complement the input inversion list (which must be a Unicode property,
8577      * all of which don't match above the Unicode maximum code point.)  And
8578      * Perl has chosen to not have the inversion match above that either.  This
8579      * adds a 0x110000 if the list didn't end with it, and removes it if it did
8580      */
8581
8582     UV len;
8583     UV* array;
8584
8585     PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
8586
8587     _invlist_invert(invlist);
8588
8589     len = _invlist_len(invlist);
8590
8591     if (len != 0) { /* If empty do nothing */
8592         array = invlist_array(invlist);
8593         if (array[len - 1] != PERL_UNICODE_MAX + 1) {
8594             /* Add 0x110000.  First, grow if necessary */
8595             len++;
8596             if (invlist_max(invlist) < len) {
8597                 invlist_extend(invlist, len);
8598                 array = invlist_array(invlist);
8599             }
8600             invlist_set_len(invlist, len, *get_invlist_offset_addr(invlist));
8601             array[len - 1] = PERL_UNICODE_MAX + 1;
8602         }
8603         else {  /* Remove the 0x110000 */
8604             invlist_set_len(invlist, len - 1, *get_invlist_offset_addr(invlist));
8605         }
8606     }
8607
8608     return;
8609 }
8610 #endif
8611
8612 PERL_STATIC_INLINE SV*
8613 S_invlist_clone(pTHX_ SV* const invlist)
8614 {
8615
8616     /* Return a new inversion list that is a copy of the input one, which is
8617      * unchanged.  The new list will not be mortal even if the old one was. */
8618
8619     /* Need to allocate extra space to accommodate Perl's addition of a
8620      * trailing NUL to SvPV's, since it thinks they are always strings */
8621     SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8622     STRLEN physical_length = SvCUR(invlist);
8623     bool offset = *(get_invlist_offset_addr(invlist));
8624
8625     PERL_ARGS_ASSERT_INVLIST_CLONE;
8626
8627     *(get_invlist_offset_addr(new_invlist)) = offset;
8628     invlist_set_len(new_invlist, _invlist_len(invlist), offset);
8629     Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
8630
8631     return new_invlist;
8632 }
8633
8634 PERL_STATIC_INLINE STRLEN*
8635 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8636 {
8637     /* Return the address of the UV that contains the current iteration
8638      * position */
8639
8640     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8641
8642     assert(SvTYPE(invlist) == SVt_INVLIST);
8643
8644     return &(((XINVLIST*) SvANY(invlist))->iterator);
8645 }
8646
8647 PERL_STATIC_INLINE void
8648 S_invlist_iterinit(pTHX_ SV* invlist)   /* Initialize iterator for invlist */
8649 {
8650     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8651
8652     *get_invlist_iter_addr(invlist) = 0;
8653 }
8654
8655 PERL_STATIC_INLINE void
8656 S_invlist_iterfinish(pTHX_ SV* invlist)
8657 {
8658     /* Terminate iterator for invlist.  This is to catch development errors.
8659      * Any iteration that is interrupted before completed should call this
8660      * function.  Functions that add code points anywhere else but to the end
8661      * of an inversion list assert that they are not in the middle of an
8662      * iteration.  If they were, the addition would make the iteration
8663      * problematical: if the iteration hadn't reached the place where things
8664      * were being added, it would be ok */
8665
8666     PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8667
8668     *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
8669 }
8670
8671 STATIC bool
8672 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8673 {
8674     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8675      * This call sets in <*start> and <*end>, the next range in <invlist>.
8676      * Returns <TRUE> if successful and the next call will return the next
8677      * range; <FALSE> if was already at the end of the list.  If the latter,
8678      * <*start> and <*end> are unchanged, and the next call to this function
8679      * will start over at the beginning of the list */
8680
8681     STRLEN* pos = get_invlist_iter_addr(invlist);
8682     UV len = _invlist_len(invlist);
8683     UV *array;
8684
8685     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8686
8687     if (*pos >= len) {
8688         *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
8689         return FALSE;
8690     }
8691
8692     array = invlist_array(invlist);
8693
8694     *start = array[(*pos)++];
8695
8696     if (*pos >= len) {
8697         *end = UV_MAX;
8698     }
8699     else {
8700         *end = array[(*pos)++] - 1;
8701     }
8702
8703     return TRUE;
8704 }
8705
8706 PERL_STATIC_INLINE bool
8707 S_invlist_is_iterating(pTHX_ SV* const invlist)
8708 {
8709     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8710
8711     return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8712 }
8713
8714 PERL_STATIC_INLINE UV
8715 S_invlist_highest(pTHX_ SV* const invlist)
8716 {
8717     /* Returns the highest code point that matches an inversion list.  This API
8718      * has an ambiguity, as it returns 0 under either the highest is actually
8719      * 0, or if the list is empty.  If this distinction matters to you, check
8720      * for emptiness before calling this function */
8721
8722     UV len = _invlist_len(invlist);
8723     UV *array;
8724
8725     PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8726
8727     if (len == 0) {
8728         return 0;
8729     }
8730
8731     array = invlist_array(invlist);
8732
8733     /* The last element in the array in the inversion list always starts a
8734      * range that goes to infinity.  That range may be for code points that are
8735      * matched in the inversion list, or it may be for ones that aren't
8736      * matched.  In the latter case, the highest code point in the set is one
8737      * less than the beginning of this range; otherwise it is the final element
8738      * of this range: infinity */
8739     return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8740            ? UV_MAX
8741            : array[len - 1] - 1;
8742 }
8743
8744 #ifndef PERL_IN_XSUB_RE
8745 SV *
8746 Perl__invlist_contents(pTHX_ SV* const invlist)
8747 {
8748     /* Get the contents of an inversion list into a string SV so that they can
8749      * be printed out.  It uses the format traditionally done for debug tracing
8750      */
8751
8752     UV start, end;
8753     SV* output = newSVpvs("\n");
8754
8755     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8756
8757     assert(! invlist_is_iterating(invlist));
8758
8759     invlist_iterinit(invlist);
8760     while (invlist_iternext(invlist, &start, &end)) {
8761         if (end == UV_MAX) {
8762             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8763         }
8764         else if (end != start) {
8765             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8766                     start,       end);
8767         }
8768         else {
8769             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8770         }
8771     }
8772
8773     return output;
8774 }
8775 #endif
8776
8777 #ifndef PERL_IN_XSUB_RE
8778 void
8779 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, const char * const indent, SV* const invlist)
8780 {
8781     /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
8782      * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
8783      * the string 'indent'.  The output looks like this:
8784          [0] 0x000A .. 0x000D
8785          [2] 0x0085
8786          [4] 0x2028 .. 0x2029
8787          [6] 0x3104 .. INFINITY
8788      * This means that the first range of code points matched by the list are
8789      * 0xA through 0xD; the second range contains only the single code point
8790      * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
8791      * are used to define each range (except if the final range extends to
8792      * infinity, only a single element is needed).  The array index of the
8793      * first element for the corresponding range is given in brackets. */
8794
8795     UV start, end;
8796     STRLEN count = 0;
8797
8798     PERL_ARGS_ASSERT__INVLIST_DUMP;
8799
8800     if (invlist_is_iterating(invlist)) {
8801         Perl_dump_indent(aTHX_ level, file,
8802              "%sCan't dump inversion list because is in middle of iterating\n",
8803              indent);
8804         return;
8805     }
8806
8807     invlist_iterinit(invlist);
8808     while (invlist_iternext(invlist, &start, &end)) {
8809         if (end == UV_MAX) {
8810             Perl_dump_indent(aTHX_ level, file,
8811                                        "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
8812                                    indent, (UV)count, start);
8813         }
8814         else if (end != start) {
8815             Perl_dump_indent(aTHX_ level, file,
8816                                     "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
8817                                 indent, (UV)count, start,         end);
8818         }
8819         else {
8820             Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
8821                                             indent, (UV)count, start);
8822         }
8823         count += 2;
8824     }
8825 }
8826 #endif
8827
8828 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
8829 bool
8830 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
8831 {
8832     /* Return a boolean as to if the two passed in inversion lists are
8833      * identical.  The final argument, if TRUE, says to take the complement of
8834      * the second inversion list before doing the comparison */
8835
8836     const UV* array_a = invlist_array(a);
8837     const UV* array_b = invlist_array(b);
8838     UV len_a = _invlist_len(a);
8839     UV len_b = _invlist_len(b);
8840
8841     UV i = 0;               /* current index into the arrays */
8842     bool retval = TRUE;     /* Assume are identical until proven otherwise */
8843
8844     PERL_ARGS_ASSERT__INVLISTEQ;
8845
8846     /* If are to compare 'a' with the complement of b, set it
8847      * up so are looking at b's complement. */
8848     if (complement_b) {
8849
8850         /* The complement of nothing is everything, so <a> would have to have
8851          * just one element, starting at zero (ending at infinity) */
8852         if (len_b == 0) {
8853             return (len_a == 1 && array_a[0] == 0);
8854         }
8855         else if (array_b[0] == 0) {
8856
8857             /* Otherwise, to complement, we invert.  Here, the first element is
8858              * 0, just remove it.  To do this, we just pretend the array starts
8859              * one later */
8860
8861             array_b++;
8862             len_b--;
8863         }
8864         else {
8865
8866             /* But if the first element is not zero, we pretend the list starts
8867              * at the 0 that is always stored immediately before the array. */
8868             array_b--;
8869             len_b++;
8870         }
8871     }
8872
8873     /* Make sure that the lengths are the same, as well as the final element
8874      * before looping through the remainder.  (Thus we test the length, final,
8875      * and first elements right off the bat) */
8876     if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8877         retval = FALSE;
8878     }
8879     else for (i = 0; i < len_a - 1; i++) {
8880         if (array_a[i] != array_b[i]) {
8881             retval = FALSE;
8882             break;
8883         }
8884     }
8885
8886     return retval;
8887 }
8888 #endif
8889
8890 #undef HEADER_LENGTH
8891 #undef TO_INTERNAL_SIZE
8892 #undef FROM_INTERNAL_SIZE
8893 #undef INVLIST_VERSION_ID
8894
8895 /* End of inversion list object */
8896
8897 STATIC void
8898 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
8899 {
8900     /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
8901      * constructs, and updates RExC_flags with them.  On input, RExC_parse
8902      * should point to the first flag; it is updated on output to point to the
8903      * final ')' or ':'.  There needs to be at least one flag, or this will
8904      * abort */
8905
8906     /* for (?g), (?gc), and (?o) warnings; warning
8907        about (?c) will warn about (?g) -- japhy    */
8908
8909 #define WASTED_O  0x01
8910 #define WASTED_G  0x02
8911 #define WASTED_C  0x04
8912 #define WASTED_GC (WASTED_G|WASTED_C)
8913     I32 wastedflags = 0x00;
8914     U32 posflags = 0, negflags = 0;
8915     U32 *flagsp = &posflags;
8916     char has_charset_modifier = '\0';
8917     regex_charset cs;
8918     bool has_use_defaults = FALSE;
8919     const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
8920
8921     PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
8922
8923     /* '^' as an initial flag sets certain defaults */
8924     if (UCHARAT(RExC_parse) == '^') {
8925         RExC_parse++;
8926         has_use_defaults = TRUE;
8927         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8928         set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8929                                         ? REGEX_UNICODE_CHARSET
8930                                         : REGEX_DEPENDS_CHARSET);
8931     }
8932
8933     cs = get_regex_charset(RExC_flags);
8934     if (cs == REGEX_DEPENDS_CHARSET
8935         && (RExC_utf8 || RExC_uni_semantics))
8936     {
8937         cs = REGEX_UNICODE_CHARSET;
8938     }
8939
8940     while (*RExC_parse) {
8941         /* && strchr("iogcmsx", *RExC_parse) */
8942         /* (?g), (?gc) and (?o) are useless here
8943            and must be globally applied -- japhy */
8944         switch (*RExC_parse) {
8945
8946             /* Code for the imsx flags */
8947             CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8948
8949             case LOCALE_PAT_MOD:
8950                 if (has_charset_modifier) {
8951                     goto excess_modifier;
8952                 }
8953                 else if (flagsp == &negflags) {
8954                     goto neg_modifier;
8955                 }
8956                 cs = REGEX_LOCALE_CHARSET;
8957                 has_charset_modifier = LOCALE_PAT_MOD;
8958                 RExC_contains_locale = 1;
8959                 break;
8960             case UNICODE_PAT_MOD:
8961                 if (has_charset_modifier) {
8962                     goto excess_modifier;
8963                 }
8964                 else if (flagsp == &negflags) {
8965                     goto neg_modifier;
8966                 }
8967                 cs = REGEX_UNICODE_CHARSET;
8968                 has_charset_modifier = UNICODE_PAT_MOD;
8969                 break;
8970             case ASCII_RESTRICT_PAT_MOD:
8971                 if (flagsp == &negflags) {
8972                     goto neg_modifier;
8973                 }
8974                 if (has_charset_modifier) {
8975                     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8976                         goto excess_modifier;
8977                     }
8978                     /* Doubled modifier implies more restricted */
8979                     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8980                 }
8981                 else {
8982                     cs = REGEX_ASCII_RESTRICTED_CHARSET;
8983                 }
8984                 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8985                 break;
8986             case DEPENDS_PAT_MOD:
8987                 if (has_use_defaults) {
8988                     goto fail_modifiers;
8989                 }
8990                 else if (flagsp == &negflags) {
8991                     goto neg_modifier;
8992                 }
8993                 else if (has_charset_modifier) {
8994                     goto excess_modifier;
8995                 }
8996
8997                 /* The dual charset means unicode semantics if the
8998                  * pattern (or target, not known until runtime) are
8999                  * utf8, or something in the pattern indicates unicode
9000                  * semantics */
9001                 cs = (RExC_utf8 || RExC_uni_semantics)
9002                      ? REGEX_UNICODE_CHARSET
9003                      : REGEX_DEPENDS_CHARSET;
9004                 has_charset_modifier = DEPENDS_PAT_MOD;
9005                 break;
9006             excess_modifier:
9007                 RExC_parse++;
9008                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9009                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9010                 }
9011                 else if (has_charset_modifier == *(RExC_parse - 1)) {
9012                     vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
9013                 }
9014                 else {
9015                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9016                 }
9017                 /*NOTREACHED*/
9018             neg_modifier:
9019                 RExC_parse++;
9020                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
9021                 /*NOTREACHED*/
9022             case ONCE_PAT_MOD: /* 'o' */
9023             case GLOBAL_PAT_MOD: /* 'g' */
9024                 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9025                     const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9026                     if (! (wastedflags & wflagbit) ) {
9027                         wastedflags |= wflagbit;
9028                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9029                         vWARN5(
9030                             RExC_parse + 1,
9031                             "Useless (%s%c) - %suse /%c modifier",
9032                             flagsp == &negflags ? "?-" : "?",
9033                             *RExC_parse,
9034                             flagsp == &negflags ? "don't " : "",
9035                             *RExC_parse
9036                         );
9037                     }
9038                 }
9039                 break;
9040
9041             case CONTINUE_PAT_MOD: /* 'c' */
9042                 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9043                     if (! (wastedflags & WASTED_C) ) {
9044                         wastedflags |= WASTED_GC;
9045                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9046                         vWARN3(
9047                             RExC_parse + 1,
9048                             "Useless (%sc) - %suse /gc modifier",
9049                             flagsp == &negflags ? "?-" : "?",
9050                             flagsp == &negflags ? "don't " : ""
9051                         );
9052                     }
9053                 }
9054                 break;
9055             case KEEPCOPY_PAT_MOD: /* 'p' */
9056                 if (flagsp == &negflags) {
9057                     if (SIZE_ONLY)
9058                         ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9059                 } else {
9060                     *flagsp |= RXf_PMf_KEEPCOPY;
9061                 }
9062                 break;
9063             case '-':
9064                 /* A flag is a default iff it is following a minus, so
9065                  * if there is a minus, it means will be trying to
9066                  * re-specify a default which is an error */
9067                 if (has_use_defaults || flagsp == &negflags) {
9068                     goto fail_modifiers;
9069                 }
9070                 flagsp = &negflags;
9071                 wastedflags = 0;  /* reset so (?g-c) warns twice */
9072                 break;
9073             case ':':
9074             case ')':
9075                 RExC_flags |= posflags;
9076                 RExC_flags &= ~negflags;
9077                 set_regex_charset(&RExC_flags, cs);
9078                 if (RExC_flags & RXf_PMf_FOLD) {
9079                     RExC_contains_i = 1;
9080                 }
9081                 return;
9082                 /*NOTREACHED*/
9083             default:
9084             fail_modifiers:
9085                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9086                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9087                 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9088                       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9089                 /*NOTREACHED*/
9090         }
9091
9092         ++RExC_parse;
9093     }
9094 }
9095
9096 /*
9097  - reg - regular expression, i.e. main body or parenthesized thing
9098  *
9099  * Caller must absorb opening parenthesis.
9100  *
9101  * Combining parenthesis handling with the base level of regular expression
9102  * is a trifle forced, but the need to tie the tails of the branches to what
9103  * follows makes it hard to avoid.
9104  */
9105 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9106 #ifdef DEBUGGING
9107 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9108 #else
9109 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9110 #endif
9111
9112 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9113    flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
9114    needs to be restarted.
9115    Otherwise would only return NULL if regbranch() returns NULL, which
9116    cannot happen.  */
9117 STATIC regnode *
9118 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9119     /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9120      * 2 is like 1, but indicates that nextchar() has been called to advance
9121      * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
9122      * this flag alerts us to the need to check for that */
9123 {
9124     dVAR;
9125     regnode *ret;               /* Will be the head of the group. */
9126     regnode *br;
9127     regnode *lastbr;
9128     regnode *ender = NULL;
9129     I32 parno = 0;
9130     I32 flags;
9131     U32 oregflags = RExC_flags;
9132     bool have_branch = 0;
9133     bool is_open = 0;
9134     I32 freeze_paren = 0;
9135     I32 after_freeze = 0;
9136
9137     char * parse_start = RExC_parse; /* MJD */
9138     char * const oregcomp_parse = RExC_parse;
9139
9140     GET_RE_DEBUG_FLAGS_DECL;
9141
9142     PERL_ARGS_ASSERT_REG;
9143     DEBUG_PARSE("reg ");
9144
9145     *flagp = 0;                         /* Tentatively. */
9146
9147
9148     /* Make an OPEN node, if parenthesized. */
9149     if (paren) {
9150
9151         /* Under /x, space and comments can be gobbled up between the '(' and
9152          * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
9153          * intervening space, as the sequence is a token, and a token should be
9154          * indivisible */
9155         bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9156
9157         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9158             char *start_verb = RExC_parse;
9159             STRLEN verb_len = 0;
9160             char *start_arg = NULL;
9161             unsigned char op = 0;
9162             int argok = 1;
9163             int internal_argval = 0; /* internal_argval is only useful if !argok */
9164
9165             if (has_intervening_patws && SIZE_ONLY) {
9166                 ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated");
9167             }
9168             while ( *RExC_parse && *RExC_parse != ')' ) {
9169                 if ( *RExC_parse == ':' ) {
9170                     start_arg = RExC_parse + 1;
9171                     break;
9172                 }
9173                 RExC_parse++;
9174             }
9175             ++start_verb;
9176             verb_len = RExC_parse - start_verb;
9177             if ( start_arg ) {
9178                 RExC_parse++;
9179                 while ( *RExC_parse && *RExC_parse != ')' )
9180                     RExC_parse++;
9181                 if ( *RExC_parse != ')' )
9182                     vFAIL("Unterminated verb pattern argument");
9183                 if ( RExC_parse == start_arg )
9184                     start_arg = NULL;
9185             } else {
9186                 if ( *RExC_parse != ')' )
9187                     vFAIL("Unterminated verb pattern");
9188             }
9189
9190             switch ( *start_verb ) {
9191             case 'A':  /* (*ACCEPT) */
9192                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9193                     op = ACCEPT;
9194                     internal_argval = RExC_nestroot;
9195                 }
9196                 break;
9197             case 'C':  /* (*COMMIT) */
9198                 if ( memEQs(start_verb,verb_len,"COMMIT") )
9199                     op = COMMIT;
9200                 break;
9201             case 'F':  /* (*FAIL) */
9202                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9203                     op = OPFAIL;
9204                     argok = 0;
9205                 }
9206                 break;
9207             case ':':  /* (*:NAME) */
9208             case 'M':  /* (*MARK:NAME) */
9209                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9210                     op = MARKPOINT;
9211                     argok = -1;
9212                 }
9213                 break;
9214             case 'P':  /* (*PRUNE) */
9215                 if ( memEQs(start_verb,verb_len,"PRUNE") )
9216                     op = PRUNE;
9217                 break;
9218             case 'S':   /* (*SKIP) */
9219                 if ( memEQs(start_verb,verb_len,"SKIP") )
9220                     op = SKIP;
9221                 break;
9222             case 'T':  /* (*THEN) */
9223                 /* [19:06] <TimToady> :: is then */
9224                 if ( memEQs(start_verb,verb_len,"THEN") ) {
9225                     op = CUTGROUP;
9226                     RExC_seen |= REG_SEEN_CUTGROUP;
9227                 }
9228                 break;
9229             }
9230             if ( ! op ) {
9231                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9232                 vFAIL2utf8f(
9233                     "Unknown verb pattern '%"UTF8f"'",
9234                     UTF8fARG(UTF, verb_len, start_verb));
9235             }
9236             if ( argok ) {
9237                 if ( start_arg && internal_argval ) {
9238                     vFAIL3("Verb pattern '%.*s' may not have an argument",
9239                         verb_len, start_verb);
9240                 } else if ( argok < 0 && !start_arg ) {
9241                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9242                         verb_len, start_verb);
9243                 } else {
9244                     ret = reganode(pRExC_state, op, internal_argval);
9245                     if ( ! internal_argval && ! SIZE_ONLY ) {
9246                         if (start_arg) {
9247                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
9248                             ARG(ret) = add_data( pRExC_state, STR_WITH_LEN("S"));
9249                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9250                             ret->flags = 0;
9251                         } else {
9252                             ret->flags = 1;
9253                         }
9254                     }
9255                 }
9256                 if (!internal_argval)
9257                     RExC_seen |= REG_SEEN_VERBARG;
9258             } else if ( start_arg ) {
9259                 vFAIL3("Verb pattern '%.*s' may not have an argument",
9260                         verb_len, start_verb);
9261             } else {
9262                 ret = reg_node(pRExC_state, op);
9263             }
9264             nextchar(pRExC_state);
9265             return ret;
9266         }
9267         else if (*RExC_parse == '?') { /* (?...) */
9268             bool is_logical = 0;
9269             const char * const seqstart = RExC_parse;
9270             if (has_intervening_patws && SIZE_ONLY) {
9271                 ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated");
9272             }
9273
9274             RExC_parse++;
9275             paren = *RExC_parse++;
9276             ret = NULL;                 /* For look-ahead/behind. */
9277             switch (paren) {
9278
9279             case 'P':   /* (?P...) variants for those used to PCRE/Python */
9280                 paren = *RExC_parse++;
9281                 if ( paren == '<')         /* (?P<...>) named capture */
9282                     goto named_capture;
9283                 else if (paren == '>') {   /* (?P>name) named recursion */
9284                     goto named_recursion;
9285                 }
9286                 else if (paren == '=') {   /* (?P=...)  named backref */
9287                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
9288                        you change this make sure you change that */
9289                     char* name_start = RExC_parse;
9290                     U32 num = 0;
9291                     SV *sv_dat = reg_scan_name(pRExC_state,
9292                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9293                     if (RExC_parse == name_start || *RExC_parse != ')')
9294                         /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9295                         vFAIL2("Sequence %.3s... not terminated",parse_start);
9296
9297                     if (!SIZE_ONLY) {
9298                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
9299                         RExC_rxi->data->data[num]=(void*)sv_dat;
9300                         SvREFCNT_inc_simple_void(sv_dat);
9301                     }
9302                     RExC_sawback = 1;
9303                     ret = reganode(pRExC_state,
9304                                    ((! FOLD)
9305                                      ? NREF
9306                                      : (ASCII_FOLD_RESTRICTED)
9307                                        ? NREFFA
9308                                        : (AT_LEAST_UNI_SEMANTICS)
9309                                          ? NREFFU
9310                                          : (LOC)
9311                                            ? NREFFL
9312                                            : NREFF),
9313                                     num);
9314                     *flagp |= HASWIDTH;
9315
9316                     Set_Node_Offset(ret, parse_start+1);
9317                     Set_Node_Cur_Length(ret, parse_start);
9318
9319                     nextchar(pRExC_state);
9320                     return ret;
9321                 }
9322                 RExC_parse++;
9323                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9324                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9325                 /*NOTREACHED*/
9326             case '<':           /* (?<...) */
9327                 if (*RExC_parse == '!')
9328                     paren = ',';
9329                 else if (*RExC_parse != '=')
9330               named_capture:
9331                 {               /* (?<...>) */
9332                     char *name_start;
9333                     SV *svname;
9334                     paren= '>';
9335             case '\'':          /* (?'...') */
9336                     name_start= RExC_parse;
9337                     svname = reg_scan_name(pRExC_state,
9338                         SIZE_ONLY ?  /* reverse test from the others */
9339                         REG_RSN_RETURN_NAME :
9340                         REG_RSN_RETURN_NULL);
9341                     if (RExC_parse == name_start || *RExC_parse != paren)
9342                         vFAIL2("Sequence (?%c... not terminated",
9343                             paren=='>' ? '<' : paren);
9344                     if (SIZE_ONLY) {
9345                         HE *he_str;
9346                         SV *sv_dat = NULL;
9347                         if (!svname) /* shouldn't happen */
9348                             Perl_croak(aTHX_
9349                                 "panic: reg_scan_name returned NULL");
9350                         if (!RExC_paren_names) {
9351                             RExC_paren_names= newHV();
9352                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
9353 #ifdef DEBUGGING
9354                             RExC_paren_name_list= newAV();
9355                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
9356 #endif
9357                         }
9358                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
9359                         if ( he_str )
9360                             sv_dat = HeVAL(he_str);
9361                         if ( ! sv_dat ) {
9362                             /* croak baby croak */
9363                             Perl_croak(aTHX_
9364                                 "panic: paren_name hash element allocation failed");
9365                         } else if ( SvPOK(sv_dat) ) {
9366                             /* (?|...) can mean we have dupes so scan to check
9367                                its already been stored. Maybe a flag indicating
9368                                we are inside such a construct would be useful,
9369                                but the arrays are likely to be quite small, so
9370                                for now we punt -- dmq */
9371                             IV count = SvIV(sv_dat);
9372                             I32 *pv = (I32*)SvPVX(sv_dat);
9373                             IV i;
9374                             for ( i = 0 ; i < count ; i++ ) {
9375                                 if ( pv[i] == RExC_npar ) {
9376                                     count = 0;
9377                                     break;
9378                                 }
9379                             }
9380                             if ( count ) {
9381                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
9382                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
9383                                 pv[count] = RExC_npar;
9384                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
9385                             }
9386                         } else {
9387                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
9388                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
9389                             SvIOK_on(sv_dat);
9390                             SvIV_set(sv_dat, 1);
9391                         }
9392 #ifdef DEBUGGING
9393                         /* Yes this does cause a memory leak in debugging Perls */
9394                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
9395                             SvREFCNT_dec_NN(svname);
9396 #endif
9397
9398                         /*sv_dump(sv_dat);*/
9399                     }
9400                     nextchar(pRExC_state);
9401                     paren = 1;
9402                     goto capturing_parens;
9403                 }
9404                 RExC_seen |= REG_SEEN_LOOKBEHIND;
9405                 RExC_in_lookbehind++;
9406                 RExC_parse++;
9407             case '=':           /* (?=...) */
9408                 RExC_seen_zerolen++;
9409                 break;
9410             case '!':           /* (?!...) */
9411                 RExC_seen_zerolen++;
9412                 if (*RExC_parse == ')') {
9413                     ret=reg_node(pRExC_state, OPFAIL);
9414                     nextchar(pRExC_state);
9415                     return ret;
9416                 }
9417                 break;
9418             case '|':           /* (?|...) */
9419                 /* branch reset, behave like a (?:...) except that
9420                    buffers in alternations share the same numbers */
9421                 paren = ':';
9422                 after_freeze = freeze_paren = RExC_npar;
9423                 break;
9424             case ':':           /* (?:...) */
9425             case '>':           /* (?>...) */
9426                 break;
9427             case '$':           /* (?$...) */
9428             case '@':           /* (?@...) */
9429                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
9430                 break;
9431             case '#':           /* (?#...) */
9432                 /* XXX As soon as we disallow separating the '?' and '*' (by
9433                  * spaces or (?#...) comment), it is believed that this case
9434                  * will be unreachable and can be removed.  See
9435                  * [perl #117327] */
9436                 while (*RExC_parse && *RExC_parse != ')')
9437                     RExC_parse++;
9438                 if (*RExC_parse != ')')
9439                     FAIL("Sequence (?#... not terminated");
9440                 nextchar(pRExC_state);
9441                 *flagp = TRYAGAIN;
9442                 return NULL;
9443             case '0' :           /* (?0) */
9444             case 'R' :           /* (?R) */
9445                 if (*RExC_parse != ')')
9446                     FAIL("Sequence (?R) not terminated");
9447                 ret = reg_node(pRExC_state, GOSTART);
9448                     RExC_seen |= REG_SEEN_GOSTART;
9449                 *flagp |= POSTPONED;
9450                 nextchar(pRExC_state);
9451                 return ret;
9452                 /*notreached*/
9453             { /* named and numeric backreferences */
9454                 I32 num;
9455             case '&':            /* (?&NAME) */
9456                 parse_start = RExC_parse - 1;
9457               named_recursion:
9458                 {
9459                     SV *sv_dat = reg_scan_name(pRExC_state,
9460                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9461                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9462                 }
9463                 if (RExC_parse == RExC_end || *RExC_parse != ')')
9464                     vFAIL("Sequence (?&... not terminated");
9465                 goto gen_recurse_regop;
9466                 assert(0); /* NOT REACHED */
9467             case '+':
9468                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9469                     RExC_parse++;
9470                     vFAIL("Illegal pattern");
9471                 }
9472                 goto parse_recursion;
9473                 /* NOT REACHED*/
9474             case '-': /* (?-1) */
9475                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9476                     RExC_parse--; /* rewind to let it be handled later */
9477                     goto parse_flags;
9478                 }
9479                 /*FALLTHROUGH */
9480             case '1': case '2': case '3': case '4': /* (?1) */
9481             case '5': case '6': case '7': case '8': case '9':
9482                 RExC_parse--;
9483               parse_recursion:
9484                 num = atoi(RExC_parse);
9485                 parse_start = RExC_parse - 1; /* MJD */
9486                 if (*RExC_parse == '-')
9487                     RExC_parse++;
9488                 while (isDIGIT(*RExC_parse))
9489                         RExC_parse++;
9490                 if (*RExC_parse!=')')
9491                     vFAIL("Expecting close bracket");
9492
9493               gen_recurse_regop:
9494                 if ( paren == '-' ) {
9495                     /*
9496                     Diagram of capture buffer numbering.
9497                     Top line is the normal capture buffer numbers
9498                     Bottom line is the negative indexing as from
9499                     the X (the (?-2))
9500
9501                     +   1 2    3 4 5 X          6 7
9502                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
9503                     -   5 4    3 2 1 X          x x
9504
9505                     */
9506                     num = RExC_npar + num;
9507                     if (num < 1)  {
9508                         RExC_parse++;
9509                         vFAIL("Reference to nonexistent group");
9510                     }
9511                 } else if ( paren == '+' ) {
9512                     num = RExC_npar + num - 1;
9513                 }
9514
9515                 ret = reganode(pRExC_state, GOSUB, num);
9516                 if (!SIZE_ONLY) {
9517                     if (num > (I32)RExC_rx->nparens) {
9518                         RExC_parse++;
9519                         vFAIL("Reference to nonexistent group");
9520                     }
9521                     ARG2L_SET( ret, RExC_recurse_count++);
9522                     RExC_emit++;
9523                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9524                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
9525                 } else {
9526                     RExC_size++;
9527                 }
9528                 RExC_seen |= REG_SEEN_RECURSE;
9529                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
9530                 Set_Node_Offset(ret, parse_start); /* MJD */
9531
9532                 *flagp |= POSTPONED;
9533                 nextchar(pRExC_state);
9534                 return ret;
9535             } /* named and numeric backreferences */
9536             assert(0); /* NOT REACHED */
9537
9538             case '?':           /* (??...) */
9539                 is_logical = 1;
9540                 if (*RExC_parse != '{') {
9541                     RExC_parse++;
9542                     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9543                     vFAIL2utf8f(
9544                         "Sequence (%"UTF8f"...) not recognized",
9545                         UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9546                     /*NOTREACHED*/
9547                 }
9548                 *flagp |= POSTPONED;
9549                 paren = *RExC_parse++;
9550                 /* FALL THROUGH */
9551             case '{':           /* (?{...}) */
9552             {
9553                 U32 n = 0;
9554                 struct reg_code_block *cb;
9555
9556                 RExC_seen_zerolen++;
9557
9558                 if (   !pRExC_state->num_code_blocks
9559                     || pRExC_state->code_index >= pRExC_state->num_code_blocks
9560                     || pRExC_state->code_blocks[pRExC_state->code_index].start
9561                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
9562                             - RExC_start)
9563                 ) {
9564                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
9565                         FAIL("panic: Sequence (?{...}): no code block found\n");
9566                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
9567                 }
9568                 /* this is a pre-compiled code block (?{...}) */
9569                 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
9570                 RExC_parse = RExC_start + cb->end;
9571                 if (!SIZE_ONLY) {
9572                     OP *o = cb->block;
9573                     if (cb->src_regex) {
9574                         n = add_data(pRExC_state, STR_WITH_LEN("rl"));
9575                         RExC_rxi->data->data[n] =
9576                             (void*)SvREFCNT_inc((SV*)cb->src_regex);
9577                         RExC_rxi->data->data[n+1] = (void*)o;
9578                     }
9579                     else {
9580                         n = add_data(pRExC_state,
9581                                (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
9582                         RExC_rxi->data->data[n] = (void*)o;
9583                     }
9584                 }
9585                 pRExC_state->code_index++;
9586                 nextchar(pRExC_state);
9587
9588                 if (is_logical) {
9589                     regnode *eval;
9590                     ret = reg_node(pRExC_state, LOGICAL);
9591                     eval = reganode(pRExC_state, EVAL, n);
9592                     if (!SIZE_ONLY) {
9593                         ret->flags = 2;
9594                         /* for later propagation into (??{}) return value */
9595                         eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
9596                     }
9597                     REGTAIL(pRExC_state, ret, eval);
9598                     /* deal with the length of this later - MJD */
9599                     return ret;
9600                 }
9601                 ret = reganode(pRExC_state, EVAL, n);
9602                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
9603                 Set_Node_Offset(ret, parse_start);
9604                 return ret;
9605             }
9606             case '(':           /* (?(?{...})...) and (?(?=...)...) */
9607             {
9608                 int is_define= 0;
9609                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
9610                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9611                         || RExC_parse[1] == '<'
9612                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
9613                         I32 flag;
9614                         regnode *tail;
9615
9616                         ret = reg_node(pRExC_state, LOGICAL);
9617                         if (!SIZE_ONLY)
9618                             ret->flags = 1;
9619
9620                         tail = reg(pRExC_state, 1, &flag, depth+1);
9621                         if (flag & RESTART_UTF8) {
9622                             *flagp = RESTART_UTF8;
9623                             return NULL;
9624                         }
9625                         REGTAIL(pRExC_state, ret, tail);
9626                         goto insert_if;
9627                     }
9628                 }
9629                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
9630                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9631                 {
9632                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
9633                     char *name_start= RExC_parse++;
9634                     U32 num = 0;
9635                     SV *sv_dat=reg_scan_name(pRExC_state,
9636                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9637                     if (RExC_parse == name_start || *RExC_parse != ch)
9638                         vFAIL2("Sequence (?(%c... not terminated",
9639                             (ch == '>' ? '<' : ch));
9640                     RExC_parse++;
9641                     if (!SIZE_ONLY) {
9642                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
9643                         RExC_rxi->data->data[num]=(void*)sv_dat;
9644                         SvREFCNT_inc_simple_void(sv_dat);
9645                     }
9646                     ret = reganode(pRExC_state,NGROUPP,num);
9647                     goto insert_if_check_paren;
9648                 }
9649                 else if (RExC_parse[0] == 'D' &&
9650                          RExC_parse[1] == 'E' &&
9651                          RExC_parse[2] == 'F' &&
9652                          RExC_parse[3] == 'I' &&
9653                          RExC_parse[4] == 'N' &&
9654                          RExC_parse[5] == 'E')
9655                 {
9656                     ret = reganode(pRExC_state,DEFINEP,0);
9657                     RExC_parse +=6 ;
9658                     is_define = 1;
9659                     goto insert_if_check_paren;
9660                 }
9661                 else if (RExC_parse[0] == 'R') {
9662                     RExC_parse++;
9663                     parno = 0;
9664                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9665                         parno = atoi(RExC_parse++);
9666                         while (isDIGIT(*RExC_parse))
9667                             RExC_parse++;
9668                     } else if (RExC_parse[0] == '&') {
9669                         SV *sv_dat;
9670                         RExC_parse++;
9671                         sv_dat = reg_scan_name(pRExC_state,
9672                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9673                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9674                     }
9675                     ret = reganode(pRExC_state,INSUBP,parno);
9676                     goto insert_if_check_paren;
9677                 }
9678                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9679                     /* (?(1)...) */
9680                     char c;
9681                     char *tmp;
9682                     parno = atoi(RExC_parse++);
9683
9684                     while (isDIGIT(*RExC_parse))
9685                         RExC_parse++;
9686                     ret = reganode(pRExC_state, GROUPP, parno);
9687
9688                  insert_if_check_paren:
9689                     if (*(tmp = nextchar(pRExC_state)) != ')') {
9690                         /* nextchar also skips comments, so undo its work
9691                          * and skip over the the next character.
9692                          */
9693                         RExC_parse = tmp;
9694                         RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9695                         vFAIL("Switch condition not recognized");
9696                     }
9697                   insert_if:
9698                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
9699                     br = regbranch(pRExC_state, &flags, 1,depth+1);
9700                     if (br == NULL) {
9701                         if (flags & RESTART_UTF8) {
9702                             *flagp = RESTART_UTF8;
9703                             return NULL;
9704                         }
9705                         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9706                               (UV) flags);
9707                     } else
9708                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
9709                     c = *nextchar(pRExC_state);
9710                     if (flags&HASWIDTH)
9711                         *flagp |= HASWIDTH;
9712                     if (c == '|') {
9713                         if (is_define)
9714                             vFAIL("(?(DEFINE)....) does not allow branches");
9715                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
9716                         if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
9717                             if (flags & RESTART_UTF8) {
9718                                 *flagp = RESTART_UTF8;
9719                                 return NULL;
9720                             }
9721                             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9722                                   (UV) flags);
9723                         }
9724                         REGTAIL(pRExC_state, ret, lastbr);
9725                         if (flags&HASWIDTH)
9726                             *flagp |= HASWIDTH;
9727                         c = *nextchar(pRExC_state);
9728                     }
9729                     else
9730                         lastbr = NULL;
9731                     if (c != ')')
9732                         vFAIL("Switch (?(condition)... contains too many branches");
9733                     ender = reg_node(pRExC_state, TAIL);
9734                     REGTAIL(pRExC_state, br, ender);
9735                     if (lastbr) {
9736                         REGTAIL(pRExC_state, lastbr, ender);
9737                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
9738                     }
9739                     else
9740                         REGTAIL(pRExC_state, ret, ender);
9741                     RExC_size++; /* XXX WHY do we need this?!!
9742                                     For large programs it seems to be required
9743                                     but I can't figure out why. -- dmq*/
9744                     return ret;
9745                 }
9746                 else {
9747                     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9748                     vFAIL("Unknown switch condition (?(...))");
9749                 }
9750             }
9751             case '[':           /* (?[ ... ]) */
9752                 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
9753                                          oregcomp_parse);
9754             case 0:
9755                 RExC_parse--; /* for vFAIL to print correctly */
9756                 vFAIL("Sequence (? incomplete");
9757                 break;
9758             default: /* e.g., (?i) */
9759                 --RExC_parse;
9760               parse_flags:
9761                 parse_lparen_question_flags(pRExC_state);
9762                 if (UCHARAT(RExC_parse) != ':') {
9763                     nextchar(pRExC_state);
9764                     *flagp = TRYAGAIN;
9765                     return NULL;
9766                 }
9767                 paren = ':';
9768                 nextchar(pRExC_state);
9769                 ret = NULL;
9770                 goto parse_rest;
9771             } /* end switch */
9772         }
9773         else {                  /* (...) */
9774           capturing_parens:
9775             parno = RExC_npar;
9776             RExC_npar++;
9777
9778             ret = reganode(pRExC_state, OPEN, parno);
9779             if (!SIZE_ONLY ){
9780                 if (!RExC_nestroot)
9781                     RExC_nestroot = parno;
9782                 if (RExC_seen & REG_SEEN_RECURSE
9783                     && !RExC_open_parens[parno-1])
9784                 {
9785                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9786                         "Setting open paren #%"IVdf" to %d\n",
9787                         (IV)parno, REG_NODE_NUM(ret)));
9788                     RExC_open_parens[parno-1]= ret;
9789                 }
9790             }
9791             Set_Node_Length(ret, 1); /* MJD */
9792             Set_Node_Offset(ret, RExC_parse); /* MJD */
9793             is_open = 1;
9794         }
9795     }
9796     else                        /* ! paren */
9797         ret = NULL;
9798
9799    parse_rest:
9800     /* Pick up the branches, linking them together. */
9801     parse_start = RExC_parse;   /* MJD */
9802     br = regbranch(pRExC_state, &flags, 1,depth+1);
9803
9804     /*     branch_len = (paren != 0); */
9805
9806     if (br == NULL) {
9807         if (flags & RESTART_UTF8) {
9808             *flagp = RESTART_UTF8;
9809             return NULL;
9810         }
9811         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
9812     }
9813     if (*RExC_parse == '|') {
9814         if (!SIZE_ONLY && RExC_extralen) {
9815             reginsert(pRExC_state, BRANCHJ, br, depth+1);
9816         }
9817         else {                  /* MJD */
9818             reginsert(pRExC_state, BRANCH, br, depth+1);
9819             Set_Node_Length(br, paren != 0);
9820             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9821         }
9822         have_branch = 1;
9823         if (SIZE_ONLY)
9824             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
9825     }
9826     else if (paren == ':') {
9827         *flagp |= flags&SIMPLE;
9828     }
9829     if (is_open) {                              /* Starts with OPEN. */
9830         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
9831     }
9832     else if (paren != '?')              /* Not Conditional */
9833         ret = br;
9834     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9835     lastbr = br;
9836     while (*RExC_parse == '|') {
9837         if (!SIZE_ONLY && RExC_extralen) {
9838             ender = reganode(pRExC_state, LONGJMP,0);
9839             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9840         }
9841         if (SIZE_ONLY)
9842             RExC_extralen += 2;         /* Account for LONGJMP. */
9843         nextchar(pRExC_state);
9844         if (freeze_paren) {
9845             if (RExC_npar > after_freeze)
9846                 after_freeze = RExC_npar;
9847             RExC_npar = freeze_paren;
9848         }
9849         br = regbranch(pRExC_state, &flags, 0, depth+1);
9850
9851         if (br == NULL) {
9852             if (flags & RESTART_UTF8) {
9853                 *flagp = RESTART_UTF8;
9854                 return NULL;
9855             }
9856             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
9857         }
9858         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
9859         lastbr = br;
9860         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9861     }
9862
9863     if (have_branch || paren != ':') {
9864         /* Make a closing node, and hook it on the end. */
9865         switch (paren) {
9866         case ':':
9867             ender = reg_node(pRExC_state, TAIL);
9868             break;
9869         case 1: case 2:
9870             ender = reganode(pRExC_state, CLOSE, parno);
9871             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9872                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9873                         "Setting close paren #%"IVdf" to %d\n",
9874                         (IV)parno, REG_NODE_NUM(ender)));
9875                 RExC_close_parens[parno-1]= ender;
9876                 if (RExC_nestroot == parno)
9877                     RExC_nestroot = 0;
9878             }
9879             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9880             Set_Node_Length(ender,1); /* MJD */
9881             break;
9882         case '<':
9883         case ',':
9884         case '=':
9885         case '!':
9886             *flagp &= ~HASWIDTH;
9887             /* FALL THROUGH */
9888         case '>':
9889             ender = reg_node(pRExC_state, SUCCEED);
9890             break;
9891         case 0:
9892             ender = reg_node(pRExC_state, END);
9893             if (!SIZE_ONLY) {
9894                 assert(!RExC_opend); /* there can only be one! */
9895                 RExC_opend = ender;
9896             }
9897             break;
9898         }
9899         DEBUG_PARSE_r(if (!SIZE_ONLY) {
9900             SV * const mysv_val1=sv_newmortal();
9901             SV * const mysv_val2=sv_newmortal();
9902             DEBUG_PARSE_MSG("lsbr");
9903             regprop(RExC_rx, mysv_val1, lastbr);
9904             regprop(RExC_rx, mysv_val2, ender);
9905             PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9906                           SvPV_nolen_const(mysv_val1),
9907                           (IV)REG_NODE_NUM(lastbr),
9908                           SvPV_nolen_const(mysv_val2),
9909                           (IV)REG_NODE_NUM(ender),
9910                           (IV)(ender - lastbr)
9911             );
9912         });
9913         REGTAIL(pRExC_state, lastbr, ender);
9914
9915         if (have_branch && !SIZE_ONLY) {
9916             char is_nothing= 1;
9917             if (depth==1)
9918                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9919
9920             /* Hook the tails of the branches to the closing node. */
9921             for (br = ret; br; br = regnext(br)) {
9922                 const U8 op = PL_regkind[OP(br)];
9923                 if (op == BRANCH) {
9924                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9925                     if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9926                         is_nothing= 0;
9927                 }
9928                 else if (op == BRANCHJ) {
9929                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9930                     /* for now we always disable this optimisation * /
9931                     if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9932                     */
9933                         is_nothing= 0;
9934                 }
9935             }
9936             if (is_nothing) {
9937                 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9938                 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9939                     SV * const mysv_val1=sv_newmortal();
9940                     SV * const mysv_val2=sv_newmortal();
9941                     DEBUG_PARSE_MSG("NADA");
9942                     regprop(RExC_rx, mysv_val1, ret);
9943                     regprop(RExC_rx, mysv_val2, ender);
9944                     PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9945                                   SvPV_nolen_const(mysv_val1),
9946                                   (IV)REG_NODE_NUM(ret),
9947                                   SvPV_nolen_const(mysv_val2),
9948                                   (IV)REG_NODE_NUM(ender),
9949                                   (IV)(ender - ret)
9950                     );
9951                 });
9952                 OP(br)= NOTHING;
9953                 if (OP(ender) == TAIL) {
9954                     NEXT_OFF(br)= 0;
9955                     RExC_emit= br + 1;
9956                 } else {
9957                     regnode *opt;
9958                     for ( opt= br + 1; opt < ender ; opt++ )
9959                         OP(opt)= OPTIMIZED;
9960                     NEXT_OFF(br)= ender - br;
9961                 }
9962             }
9963         }
9964     }
9965
9966     {
9967         const char *p;
9968         static const char parens[] = "=!<,>";
9969
9970         if (paren && (p = strchr(parens, paren))) {
9971             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9972             int flag = (p - parens) > 1;
9973
9974             if (paren == '>')
9975                 node = SUSPEND, flag = 0;
9976             reginsert(pRExC_state, node,ret, depth+1);
9977             Set_Node_Cur_Length(ret, parse_start);
9978             Set_Node_Offset(ret, parse_start + 1);
9979             ret->flags = flag;
9980             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9981         }
9982     }
9983
9984     /* Check for proper termination. */
9985     if (paren) {
9986         /* restore original flags, but keep (?p) */
9987         RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
9988         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9989             RExC_parse = oregcomp_parse;
9990             vFAIL("Unmatched (");
9991         }
9992     }
9993     else if (!paren && RExC_parse < RExC_end) {
9994         if (*RExC_parse == ')') {
9995             RExC_parse++;
9996             vFAIL("Unmatched )");
9997         }
9998         else
9999             FAIL("Junk on end of regexp");      /* "Can't happen". */
10000         assert(0); /* NOTREACHED */
10001     }
10002
10003     if (RExC_in_lookbehind) {
10004         RExC_in_lookbehind--;
10005     }
10006     if (after_freeze > RExC_npar)
10007         RExC_npar = after_freeze;
10008     return(ret);
10009 }
10010
10011 /*
10012  - regbranch - one alternative of an | operator
10013  *
10014  * Implements the concatenation operator.
10015  *
10016  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10017  * restarted.
10018  */
10019 STATIC regnode *
10020 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10021 {
10022     dVAR;
10023     regnode *ret;
10024     regnode *chain = NULL;
10025     regnode *latest;
10026     I32 flags = 0, c = 0;
10027     GET_RE_DEBUG_FLAGS_DECL;
10028
10029     PERL_ARGS_ASSERT_REGBRANCH;
10030
10031     DEBUG_PARSE("brnc");
10032
10033     if (first)
10034         ret = NULL;
10035     else {
10036         if (!SIZE_ONLY && RExC_extralen)
10037             ret = reganode(pRExC_state, BRANCHJ,0);
10038         else {
10039             ret = reg_node(pRExC_state, BRANCH);
10040             Set_Node_Length(ret, 1);
10041         }
10042     }
10043
10044     if (!first && SIZE_ONLY)
10045         RExC_extralen += 1;                     /* BRANCHJ */
10046
10047     *flagp = WORST;                     /* Tentatively. */
10048
10049     RExC_parse--;
10050     nextchar(pRExC_state);
10051     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10052         flags &= ~TRYAGAIN;
10053         latest = regpiece(pRExC_state, &flags,depth+1);
10054         if (latest == NULL) {
10055             if (flags & TRYAGAIN)
10056                 continue;
10057             if (flags & RESTART_UTF8) {
10058                 *flagp = RESTART_UTF8;
10059                 return NULL;
10060             }
10061             FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10062         }
10063         else if (ret == NULL)
10064             ret = latest;
10065         *flagp |= flags&(HASWIDTH|POSTPONED);
10066         if (chain == NULL)      /* First piece. */
10067             *flagp |= flags&SPSTART;
10068         else {
10069             RExC_naughty++;
10070             REGTAIL(pRExC_state, chain, latest);
10071         }
10072         chain = latest;
10073         c++;
10074     }
10075     if (chain == NULL) {        /* Loop ran zero times. */
10076         chain = reg_node(pRExC_state, NOTHING);
10077         if (ret == NULL)
10078             ret = chain;
10079     }
10080     if (c == 1) {
10081         *flagp |= flags&SIMPLE;
10082     }
10083
10084     return ret;
10085 }
10086
10087 /*
10088  - regpiece - something followed by possible [*+?]
10089  *
10090  * Note that the branching code sequences used for ? and the general cases
10091  * of * and + are somewhat optimized:  they use the same NOTHING node as
10092  * both the endmarker for their branch list and the body of the last branch.
10093  * It might seem that this node could be dispensed with entirely, but the
10094  * endmarker role is not redundant.
10095  *
10096  * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10097  * TRYAGAIN.
10098  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10099  * restarted.
10100  */
10101 STATIC regnode *
10102 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10103 {
10104     dVAR;
10105     regnode *ret;
10106     char op;
10107     char *next;
10108     I32 flags;
10109     const char * const origparse = RExC_parse;
10110     I32 min;
10111     I32 max = REG_INFTY;
10112 #ifdef RE_TRACK_PATTERN_OFFSETS
10113     char *parse_start;
10114 #endif
10115     const char *maxpos = NULL;
10116
10117     /* Save the original in case we change the emitted regop to a FAIL. */
10118     regnode * const orig_emit = RExC_emit;
10119
10120     GET_RE_DEBUG_FLAGS_DECL;
10121
10122     PERL_ARGS_ASSERT_REGPIECE;
10123
10124     DEBUG_PARSE("piec");
10125
10126     ret = regatom(pRExC_state, &flags,depth+1);
10127     if (ret == NULL) {
10128         if (flags & (TRYAGAIN|RESTART_UTF8))
10129             *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10130         else
10131             FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10132         return(NULL);
10133     }
10134
10135     op = *RExC_parse;
10136
10137     if (op == '{' && regcurly(RExC_parse, FALSE)) {
10138         maxpos = NULL;
10139 #ifdef RE_TRACK_PATTERN_OFFSETS
10140         parse_start = RExC_parse; /* MJD */
10141 #endif
10142         next = RExC_parse + 1;
10143         while (isDIGIT(*next) || *next == ',') {
10144             if (*next == ',') {
10145                 if (maxpos)
10146                     break;
10147                 else
10148                     maxpos = next;
10149             }
10150             next++;
10151         }
10152         if (*next == '}') {             /* got one */
10153             if (!maxpos)
10154                 maxpos = next;
10155             RExC_parse++;
10156             min = atoi(RExC_parse);
10157             if (*maxpos == ',')
10158                 maxpos++;
10159             else
10160                 maxpos = RExC_parse;
10161             max = atoi(maxpos);
10162             if (!max && *maxpos != '0')
10163                 max = REG_INFTY;                /* meaning "infinity" */
10164             else if (max >= REG_INFTY)
10165                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10166             RExC_parse = next;
10167             nextchar(pRExC_state);
10168             if (max < min) {    /* If can't match, warn and optimize to fail
10169                                    unconditionally */
10170                 if (SIZE_ONLY) {
10171                     ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10172
10173                     /* We can't back off the size because we have to reserve
10174                      * enough space for all the things we are about to throw
10175                      * away, but we can shrink it by the ammount we are about
10176                      * to re-use here */
10177                     RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10178                 }
10179                 else {
10180                     RExC_emit = orig_emit;
10181                 }
10182                 ret = reg_node(pRExC_state, OPFAIL);
10183                 return ret;
10184             }
10185
10186         do_curly:
10187             if ((flags&SIMPLE)) {
10188                 RExC_naughty += 2 + RExC_naughty / 2;
10189                 reginsert(pRExC_state, CURLY, ret, depth+1);
10190                 Set_Node_Offset(ret, parse_start+1); /* MJD */
10191                 Set_Node_Cur_Length(ret, parse_start);
10192             }
10193             else {
10194                 regnode * const w = reg_node(pRExC_state, WHILEM);
10195
10196                 w->flags = 0;
10197                 REGTAIL(pRExC_state, ret, w);
10198                 if (!SIZE_ONLY && RExC_extralen) {
10199                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
10200                     reginsert(pRExC_state, NOTHING,ret, depth+1);
10201                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
10202                 }
10203                 reginsert(pRExC_state, CURLYX,ret, depth+1);
10204                                 /* MJD hk */
10205                 Set_Node_Offset(ret, parse_start+1);
10206                 Set_Node_Length(ret,
10207                                 op == '{' ? (RExC_parse - parse_start) : 1);
10208
10209                 if (!SIZE_ONLY && RExC_extralen)
10210                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
10211                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10212                 if (SIZE_ONLY)
10213                     RExC_whilem_seen++, RExC_extralen += 3;
10214                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
10215             }
10216             ret->flags = 0;
10217
10218             if (min > 0)
10219                 *flagp = WORST;
10220             if (max > 0)
10221                 *flagp |= HASWIDTH;
10222             if (!SIZE_ONLY) {
10223                 ARG1_SET(ret, (U16)min);
10224                 ARG2_SET(ret, (U16)max);
10225             }
10226
10227             goto nest_check;
10228         }
10229     }
10230
10231     if (!ISMULT1(op)) {
10232         *flagp = flags;
10233         return(ret);
10234     }
10235
10236 #if 0                           /* Now runtime fix should be reliable. */
10237
10238     /* if this is reinstated, don't forget to put this back into perldiag:
10239
10240             =item Regexp *+ operand could be empty at {#} in regex m/%s/
10241
10242            (F) The part of the regexp subject to either the * or + quantifier
10243            could match an empty string. The {#} shows in the regular
10244            expression about where the problem was discovered.
10245
10246     */
10247
10248     if (!(flags&HASWIDTH) && op != '?')
10249       vFAIL("Regexp *+ operand could be empty");
10250 #endif
10251
10252 #ifdef RE_TRACK_PATTERN_OFFSETS
10253     parse_start = RExC_parse;
10254 #endif
10255     nextchar(pRExC_state);
10256
10257     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10258
10259     if (op == '*' && (flags&SIMPLE)) {
10260         reginsert(pRExC_state, STAR, ret, depth+1);
10261         ret->flags = 0;
10262         RExC_naughty += 4;
10263     }
10264     else if (op == '*') {
10265         min = 0;
10266         goto do_curly;
10267     }
10268     else if (op == '+' && (flags&SIMPLE)) {
10269         reginsert(pRExC_state, PLUS, ret, depth+1);
10270         ret->flags = 0;
10271         RExC_naughty += 3;
10272     }
10273     else if (op == '+') {
10274         min = 1;
10275         goto do_curly;
10276     }
10277     else if (op == '?') {
10278         min = 0; max = 1;
10279         goto do_curly;
10280     }
10281   nest_check:
10282     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10283         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10284         ckWARN2reg(RExC_parse,
10285                    "%"UTF8f" matches null string many times",
10286                    UTF8fARG(UTF, (RExC_parse >= origparse ? RExC_parse - origparse : 0),
10287                    origparse));
10288         (void)ReREFCNT_inc(RExC_rx_sv);
10289     }
10290
10291     if (RExC_parse < RExC_end && *RExC_parse == '?') {
10292         nextchar(pRExC_state);
10293         reginsert(pRExC_state, MINMOD, ret, depth+1);
10294         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
10295     }
10296     else
10297     if (RExC_parse < RExC_end && *RExC_parse == '+') {
10298         regnode *ender;
10299         nextchar(pRExC_state);
10300         ender = reg_node(pRExC_state, SUCCEED);
10301         REGTAIL(pRExC_state, ret, ender);
10302         reginsert(pRExC_state, SUSPEND, ret, depth+1);
10303         ret->flags = 0;
10304         ender = reg_node(pRExC_state, TAIL);
10305         REGTAIL(pRExC_state, ret, ender);
10306     }
10307
10308     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
10309         RExC_parse++;
10310         vFAIL("Nested quantifiers");
10311     }
10312
10313     return(ret);
10314 }
10315
10316 STATIC bool
10317 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
10318         const bool strict   /* Apply stricter parsing rules? */
10319     )
10320 {
10321
10322  /* This is expected to be called by a parser routine that has recognized '\N'
10323    and needs to handle the rest. RExC_parse is expected to point at the first
10324    char following the N at the time of the call.  On successful return,
10325    RExC_parse has been updated to point to just after the sequence identified
10326    by this routine, and <*flagp> has been updated.
10327
10328    The \N may be inside (indicated by the boolean <in_char_class>) or outside a
10329    character class.
10330
10331    \N may begin either a named sequence, or if outside a character class, mean
10332    to match a non-newline.  For non single-quoted regexes, the tokenizer has
10333    attempted to decide which, and in the case of a named sequence, converted it
10334    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
10335    where c1... are the characters in the sequence.  For single-quoted regexes,
10336    the tokenizer passes the \N sequence through unchanged; this code will not
10337    attempt to determine this nor expand those, instead raising a syntax error.
10338    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
10339    or there is no '}', it signals that this \N occurrence means to match a
10340    non-newline.
10341
10342    Only the \N{U+...} form should occur in a character class, for the same
10343    reason that '.' inside a character class means to just match a period: it
10344    just doesn't make sense.
10345
10346    The function raises an error (via vFAIL), and doesn't return for various
10347    syntax errors.  Otherwise it returns TRUE and sets <node_p> or <valuep> on
10348    success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
10349    RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
10350    only possible if node_p is non-NULL.
10351
10352
10353    If <valuep> is non-null, it means the caller can accept an input sequence
10354    consisting of a just a single code point; <*valuep> is set to that value
10355    if the input is such.
10356
10357    If <node_p> is non-null it signifies that the caller can accept any other
10358    legal sequence (i.e., one that isn't just a single code point).  <*node_p>
10359    is set as follows:
10360     1) \N means not-a-NL: points to a newly created REG_ANY node;
10361     2) \N{}:              points to a new NOTHING node;
10362     3) otherwise:         points to a new EXACT node containing the resolved
10363                           string.
10364    Note that FALSE is returned for single code point sequences if <valuep> is
10365    null.
10366  */
10367
10368     char * endbrace;    /* '}' following the name */
10369     char* p;
10370     char *endchar;      /* Points to '.' or '}' ending cur char in the input
10371                            stream */
10372     bool has_multiple_chars; /* true if the input stream contains a sequence of
10373                                 more than one character */
10374
10375     GET_RE_DEBUG_FLAGS_DECL;
10376
10377     PERL_ARGS_ASSERT_GROK_BSLASH_N;
10378
10379     GET_RE_DEBUG_FLAGS;
10380
10381     assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
10382
10383     /* The [^\n] meaning of \N ignores spaces and comments under the /x
10384      * modifier.  The other meaning does not, so use a temporary until we find
10385      * out which we are being called with */
10386     p = (RExC_flags & RXf_PMf_EXTENDED)
10387         ? regwhite( pRExC_state, RExC_parse )
10388         : RExC_parse;
10389
10390     /* Disambiguate between \N meaning a named character versus \N meaning
10391      * [^\n].  The former is assumed when it can't be the latter. */
10392     if (*p != '{' || regcurly(p, FALSE)) {
10393         RExC_parse = p;
10394         if (! node_p) {
10395             /* no bare \N allowed in a charclass */
10396             if (in_char_class) {
10397                 vFAIL("\\N in a character class must be a named character: \\N{...}");
10398             }
10399             return FALSE;
10400         }
10401         RExC_parse--;   /* Need to back off so nextchar() doesn't skip the
10402                            current char */
10403         nextchar(pRExC_state);
10404         *node_p = reg_node(pRExC_state, REG_ANY);
10405         *flagp |= HASWIDTH|SIMPLE;
10406         RExC_naughty++;
10407         Set_Node_Length(*node_p, 1); /* MJD */
10408         return TRUE;
10409     }
10410
10411     /* Here, we have decided it should be a named character or sequence */
10412
10413     /* The test above made sure that the next real character is a '{', but
10414      * under the /x modifier, it could be separated by space (or a comment and
10415      * \n) and this is not allowed (for consistency with \x{...} and the
10416      * tokenizer handling of \N{NAME}). */
10417     if (*RExC_parse != '{') {
10418         vFAIL("Missing braces on \\N{}");
10419     }
10420
10421     RExC_parse++;       /* Skip past the '{' */
10422
10423     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
10424         || ! (endbrace == RExC_parse            /* nothing between the {} */
10425               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
10426                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
10427     {
10428         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
10429         vFAIL("\\N{NAME} must be resolved by the lexer");
10430     }
10431
10432     if (endbrace == RExC_parse) {   /* empty: \N{} */
10433         bool ret = TRUE;
10434         if (node_p) {
10435             *node_p = reg_node(pRExC_state,NOTHING);
10436         }
10437         else if (in_char_class) {
10438             if (SIZE_ONLY && in_char_class) {
10439                 if (strict) {
10440                     RExC_parse++;   /* Position after the "}" */
10441                     vFAIL("Zero length \\N{}");
10442                 }
10443                 else {
10444                     ckWARNreg(RExC_parse,
10445                               "Ignoring zero length \\N{} in character class");
10446                 }
10447             }
10448             ret = FALSE;
10449         }
10450         else {
10451             return FALSE;
10452         }
10453         nextchar(pRExC_state);
10454         return ret;
10455     }
10456
10457     RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
10458     RExC_parse += 2;    /* Skip past the 'U+' */
10459
10460     endchar = RExC_parse + strcspn(RExC_parse, ".}");
10461
10462     /* Code points are separated by dots.  If none, there is only one code
10463      * point, and is terminated by the brace */
10464     has_multiple_chars = (endchar < endbrace);
10465
10466     if (valuep && (! has_multiple_chars || in_char_class)) {
10467         /* We only pay attention to the first char of
10468         multichar strings being returned in char classes. I kinda wonder
10469         if this makes sense as it does change the behaviour
10470         from earlier versions, OTOH that behaviour was broken
10471         as well. XXX Solution is to recharacterize as
10472         [rest-of-class]|multi1|multi2... */
10473
10474         STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
10475         I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
10476             | PERL_SCAN_DISALLOW_PREFIX
10477             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
10478
10479         *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
10480
10481         /* The tokenizer should have guaranteed validity, but it's possible to
10482          * bypass it by using single quoting, so check */
10483         if (length_of_hex == 0
10484             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
10485         {
10486             RExC_parse += length_of_hex;        /* Includes all the valid */
10487             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
10488                             ? UTF8SKIP(RExC_parse)
10489                             : 1;
10490             /* Guard against malformed utf8 */
10491             if (RExC_parse >= endchar) {
10492                 RExC_parse = endchar;
10493             }
10494             vFAIL("Invalid hexadecimal number in \\N{U+...}");
10495         }
10496
10497         if (in_char_class && has_multiple_chars) {
10498             if (strict) {
10499                 RExC_parse = endbrace;
10500                 vFAIL("\\N{} in character class restricted to one character");
10501             }
10502             else {
10503                 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
10504             }
10505         }
10506
10507         RExC_parse = endbrace + 1;
10508     }
10509     else if (! node_p || ! has_multiple_chars) {
10510
10511         /* Here, the input is legal, but not according to the caller's
10512          * options.  We fail without advancing the parse, so that the
10513          * caller can try again */
10514         RExC_parse = p;
10515         return FALSE;
10516     }
10517     else {
10518
10519         /* What is done here is to convert this to a sub-pattern of the form
10520          * (?:\x{char1}\x{char2}...)
10521          * and then call reg recursively.  That way, it retains its atomicness,
10522          * while not having to worry about special handling that some code
10523          * points may have.  toke.c has converted the original Unicode values
10524          * to native, so that we can just pass on the hex values unchanged.  We
10525          * do have to set a flag to keep recoding from happening in the
10526          * recursion */
10527
10528         SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
10529         STRLEN len;
10530         char *orig_end = RExC_end;
10531         I32 flags;
10532
10533         while (RExC_parse < endbrace) {
10534
10535             /* Convert to notation the rest of the code understands */
10536             sv_catpv(substitute_parse, "\\x{");
10537             sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
10538             sv_catpv(substitute_parse, "}");
10539
10540             /* Point to the beginning of the next character in the sequence. */
10541             RExC_parse = endchar + 1;
10542             endchar = RExC_parse + strcspn(RExC_parse, ".}");
10543         }
10544         sv_catpv(substitute_parse, ")");
10545
10546         RExC_parse = SvPV(substitute_parse, len);
10547
10548         /* Don't allow empty number */
10549         if (len < 8) {
10550             vFAIL("Invalid hexadecimal number in \\N{U+...}");
10551         }
10552         RExC_end = RExC_parse + len;
10553
10554         /* The values are Unicode, and therefore not subject to recoding */
10555         RExC_override_recoding = 1;
10556
10557         if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
10558             if (flags & RESTART_UTF8) {
10559                 *flagp = RESTART_UTF8;
10560                 return FALSE;
10561             }
10562             FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
10563                   (UV) flags);
10564         }
10565         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10566
10567         RExC_parse = endbrace;
10568         RExC_end = orig_end;
10569         RExC_override_recoding = 0;
10570
10571         nextchar(pRExC_state);
10572     }
10573
10574     return TRUE;
10575 }
10576
10577
10578 /*
10579  * reg_recode
10580  *
10581  * It returns the code point in utf8 for the value in *encp.
10582  *    value: a code value in the source encoding
10583  *    encp:  a pointer to an Encode object
10584  *
10585  * If the result from Encode is not a single character,
10586  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
10587  */
10588 STATIC UV
10589 S_reg_recode(pTHX_ const char value, SV **encp)
10590 {
10591     STRLEN numlen = 1;
10592     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
10593     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
10594     const STRLEN newlen = SvCUR(sv);
10595     UV uv = UNICODE_REPLACEMENT;
10596
10597     PERL_ARGS_ASSERT_REG_RECODE;
10598
10599     if (newlen)
10600         uv = SvUTF8(sv)
10601              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
10602              : *(U8*)s;
10603
10604     if (!newlen || numlen != newlen) {
10605         uv = UNICODE_REPLACEMENT;
10606         *encp = NULL;
10607     }
10608     return uv;
10609 }
10610
10611 PERL_STATIC_INLINE U8
10612 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
10613 {
10614     U8 op;
10615
10616     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
10617
10618     if (! FOLD) {
10619         return EXACT;
10620     }
10621
10622     op = get_regex_charset(RExC_flags);
10623     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
10624         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
10625                  been, so there is no hole */
10626     }
10627
10628     return op + EXACTF;
10629 }
10630
10631 PERL_STATIC_INLINE void
10632 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
10633 {
10634     /* This knows the details about sizing an EXACTish node, setting flags for
10635      * it (by setting <*flagp>, and potentially populating it with a single
10636      * character.
10637      *
10638      * If <len> (the length in bytes) is non-zero, this function assumes that
10639      * the node has already been populated, and just does the sizing.  In this
10640      * case <code_point> should be the final code point that has already been
10641      * placed into the node.  This value will be ignored except that under some
10642      * circumstances <*flagp> is set based on it.
10643      *
10644      * If <len> is zero, the function assumes that the node is to contain only
10645      * the single character given by <code_point> and calculates what <len>
10646      * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
10647      * additionally will populate the node's STRING with <code_point>, if <len>
10648      * is 0.  In both cases <*flagp> is appropriately set
10649      *
10650      * It knows that under FOLD, the Latin Sharp S and UTF characters above
10651      * 255, must be folded (the former only when the rules indicate it can
10652      * match 'ss') */
10653
10654     bool len_passed_in = cBOOL(len != 0);
10655     U8 character[UTF8_MAXBYTES_CASE+1];
10656
10657     PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
10658
10659     if (! len_passed_in) {
10660         if (UTF) {
10661             if (FOLD && (! LOC || code_point > 255)) {
10662                 _to_uni_fold_flags(code_point,
10663                                    character,
10664                                    &len,
10665                                    FOLD_FLAGS_FULL | ((LOC)
10666                                                      ? FOLD_FLAGS_LOCALE
10667                                                      : (ASCII_FOLD_RESTRICTED)
10668                                                        ? FOLD_FLAGS_NOMIX_ASCII
10669                                                        : 0));
10670             }
10671             else {
10672                 uvchr_to_utf8( character, code_point);
10673                 len = UTF8SKIP(character);
10674             }
10675         }
10676         else if (! FOLD
10677                  || code_point != LATIN_SMALL_LETTER_SHARP_S
10678                  || ASCII_FOLD_RESTRICTED
10679                  || ! AT_LEAST_UNI_SEMANTICS)
10680         {
10681             *character = (U8) code_point;
10682             len = 1;
10683         }
10684         else {
10685             *character = 's';
10686             *(character + 1) = 's';
10687             len = 2;
10688         }
10689     }
10690
10691     if (SIZE_ONLY) {
10692         RExC_size += STR_SZ(len);
10693     }
10694     else {
10695         RExC_emit += STR_SZ(len);
10696         STR_LEN(node) = len;
10697         if (! len_passed_in) {
10698             Copy((char *) character, STRING(node), len, char);
10699         }
10700     }
10701
10702     *flagp |= HASWIDTH;
10703
10704     /* A single character node is SIMPLE, except for the special-cased SHARP S
10705      * under /di. */
10706     if ((len == 1 || (UTF && len == UNISKIP(code_point)))
10707         && (code_point != LATIN_SMALL_LETTER_SHARP_S
10708             || ! FOLD || ! DEPENDS_SEMANTICS))
10709     {
10710         *flagp |= SIMPLE;
10711     }
10712 }
10713
10714
10715 /* return atoi(p), unless it's too big to sensibly be a backref,
10716  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
10717
10718 static I32
10719 S_backref_value(char *p)
10720 {
10721     char *q = p;
10722
10723     for (;isDIGIT(*q); q++); /* calculate length of num */
10724     if (q - p == 0 || q - p > 9)
10725         return I32_MAX;
10726     return atoi(p);
10727 }
10728
10729
10730 /*
10731  - regatom - the lowest level
10732
10733    Try to identify anything special at the start of the pattern. If there
10734    is, then handle it as required. This may involve generating a single regop,
10735    such as for an assertion; or it may involve recursing, such as to
10736    handle a () structure.
10737
10738    If the string doesn't start with something special then we gobble up
10739    as much literal text as we can.
10740
10741    Once we have been able to handle whatever type of thing started the
10742    sequence, we return.
10743
10744    Note: we have to be careful with escapes, as they can be both literal
10745    and special, and in the case of \10 and friends, context determines which.
10746
10747    A summary of the code structure is:
10748
10749    switch (first_byte) {
10750         cases for each special:
10751             handle this special;
10752             break;
10753         case '\\':
10754             switch (2nd byte) {
10755                 cases for each unambiguous special:
10756                     handle this special;
10757                     break;
10758                 cases for each ambigous special/literal:
10759                     disambiguate;
10760                     if (special)  handle here
10761                     else goto defchar;
10762                 default: // unambiguously literal:
10763                     goto defchar;
10764             }
10765         default:  // is a literal char
10766             // FALL THROUGH
10767         defchar:
10768             create EXACTish node for literal;
10769             while (more input and node isn't full) {
10770                 switch (input_byte) {
10771                    cases for each special;
10772                        make sure parse pointer is set so that the next call to
10773                            regatom will see this special first
10774                        goto loopdone; // EXACTish node terminated by prev. char
10775                    default:
10776                        append char to EXACTISH node;
10777                 }
10778                 get next input byte;
10779             }
10780         loopdone:
10781    }
10782    return the generated node;
10783
10784    Specifically there are two separate switches for handling
10785    escape sequences, with the one for handling literal escapes requiring
10786    a dummy entry for all of the special escapes that are actually handled
10787    by the other.
10788
10789    Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
10790    TRYAGAIN.
10791    Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10792    restarted.
10793    Otherwise does not return NULL.
10794 */
10795
10796 STATIC regnode *
10797 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10798 {
10799     dVAR;
10800     regnode *ret = NULL;
10801     I32 flags = 0;
10802     char *parse_start = RExC_parse;
10803     U8 op;
10804     int invert = 0;
10805
10806     GET_RE_DEBUG_FLAGS_DECL;
10807
10808     *flagp = WORST;             /* Tentatively. */
10809
10810     DEBUG_PARSE("atom");
10811
10812     PERL_ARGS_ASSERT_REGATOM;
10813
10814 tryagain:
10815     switch ((U8)*RExC_parse) {
10816     case '^':
10817         RExC_seen_zerolen++;
10818         nextchar(pRExC_state);
10819         if (RExC_flags & RXf_PMf_MULTILINE)
10820             ret = reg_node(pRExC_state, MBOL);
10821         else if (RExC_flags & RXf_PMf_SINGLELINE)
10822             ret = reg_node(pRExC_state, SBOL);
10823         else
10824             ret = reg_node(pRExC_state, BOL);
10825         Set_Node_Length(ret, 1); /* MJD */
10826         break;
10827     case '$':
10828         nextchar(pRExC_state);
10829         if (*RExC_parse)
10830             RExC_seen_zerolen++;
10831         if (RExC_flags & RXf_PMf_MULTILINE)
10832             ret = reg_node(pRExC_state, MEOL);
10833         else if (RExC_flags & RXf_PMf_SINGLELINE)
10834             ret = reg_node(pRExC_state, SEOL);
10835         else
10836             ret = reg_node(pRExC_state, EOL);
10837         Set_Node_Length(ret, 1); /* MJD */
10838         break;
10839     case '.':
10840         nextchar(pRExC_state);
10841         if (RExC_flags & RXf_PMf_SINGLELINE)
10842             ret = reg_node(pRExC_state, SANY);
10843         else
10844             ret = reg_node(pRExC_state, REG_ANY);
10845         *flagp |= HASWIDTH|SIMPLE;
10846         RExC_naughty++;
10847         Set_Node_Length(ret, 1); /* MJD */
10848         break;
10849     case '[':
10850     {
10851         char * const oregcomp_parse = ++RExC_parse;
10852         ret = regclass(pRExC_state, flagp,depth+1,
10853                        FALSE, /* means parse the whole char class */
10854                        TRUE, /* allow multi-char folds */
10855                        FALSE, /* don't silence non-portable warnings. */
10856                        NULL);
10857         if (*RExC_parse != ']') {
10858             RExC_parse = oregcomp_parse;
10859             vFAIL("Unmatched [");
10860         }
10861         if (ret == NULL) {
10862             if (*flagp & RESTART_UTF8)
10863                 return NULL;
10864             FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
10865                   (UV) *flagp);
10866         }
10867         nextchar(pRExC_state);
10868         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10869         break;
10870     }
10871     case '(':
10872         nextchar(pRExC_state);
10873         ret = reg(pRExC_state, 2, &flags,depth+1);
10874         if (ret == NULL) {
10875                 if (flags & TRYAGAIN) {
10876                     if (RExC_parse == RExC_end) {
10877                          /* Make parent create an empty node if needed. */
10878                         *flagp |= TRYAGAIN;
10879                         return(NULL);
10880                     }
10881                     goto tryagain;
10882                 }
10883                 if (flags & RESTART_UTF8) {
10884                     *flagp = RESTART_UTF8;
10885                     return NULL;
10886                 }
10887                 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", (UV) flags);
10888         }
10889         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10890         break;
10891     case '|':
10892     case ')':
10893         if (flags & TRYAGAIN) {
10894             *flagp |= TRYAGAIN;
10895             return NULL;
10896         }
10897         vFAIL("Internal urp");
10898                                 /* Supposed to be caught earlier. */
10899         break;
10900     case '{':
10901         if (!regcurly(RExC_parse, FALSE)) {
10902             RExC_parse++;
10903             goto defchar;
10904         }
10905         /* FALL THROUGH */
10906     case '?':
10907     case '+':
10908     case '*':
10909         RExC_parse++;
10910         vFAIL("Quantifier follows nothing");
10911         break;
10912     case '\\':
10913         /* Special Escapes
10914
10915            This switch handles escape sequences that resolve to some kind
10916            of special regop and not to literal text. Escape sequnces that
10917            resolve to literal text are handled below in the switch marked
10918            "Literal Escapes".
10919
10920            Every entry in this switch *must* have a corresponding entry
10921            in the literal escape switch. However, the opposite is not
10922            required, as the default for this switch is to jump to the
10923            literal text handling code.
10924         */
10925         switch ((U8)*++RExC_parse) {
10926             U8 arg;
10927         /* Special Escapes */
10928         case 'A':
10929             RExC_seen_zerolen++;
10930             ret = reg_node(pRExC_state, SBOL);
10931             *flagp |= SIMPLE;
10932             goto finish_meta_pat;
10933         case 'G':
10934             ret = reg_node(pRExC_state, GPOS);
10935             RExC_seen |= REG_SEEN_GPOS;
10936             *flagp |= SIMPLE;
10937             goto finish_meta_pat;
10938         case 'K':
10939             RExC_seen_zerolen++;
10940             ret = reg_node(pRExC_state, KEEPS);
10941             *flagp |= SIMPLE;
10942             /* XXX:dmq : disabling in-place substitution seems to
10943              * be necessary here to avoid cases of memory corruption, as
10944              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10945              */
10946             RExC_seen |= REG_SEEN_LOOKBEHIND;
10947             goto finish_meta_pat;
10948         case 'Z':
10949             ret = reg_node(pRExC_state, SEOL);
10950             *flagp |= SIMPLE;
10951             RExC_seen_zerolen++;                /* Do not optimize RE away */
10952             goto finish_meta_pat;
10953         case 'z':
10954             ret = reg_node(pRExC_state, EOS);
10955             *flagp |= SIMPLE;
10956             RExC_seen_zerolen++;                /* Do not optimize RE away */
10957             goto finish_meta_pat;
10958         case 'C':
10959             ret = reg_node(pRExC_state, CANY);
10960             RExC_seen |= REG_SEEN_CANY;
10961             *flagp |= HASWIDTH|SIMPLE;
10962             goto finish_meta_pat;
10963         case 'X':
10964             ret = reg_node(pRExC_state, CLUMP);
10965             *flagp |= HASWIDTH;
10966             goto finish_meta_pat;
10967
10968         case 'W':
10969             invert = 1;
10970             /* FALLTHROUGH */
10971         case 'w':
10972             arg = ANYOF_WORDCHAR;
10973             goto join_posix;
10974
10975         case 'b':
10976             RExC_seen_zerolen++;
10977             RExC_seen |= REG_SEEN_LOOKBEHIND;
10978             op = BOUND + get_regex_charset(RExC_flags);
10979             if (op > BOUNDA) {  /* /aa is same as /a */
10980                 op = BOUNDA;
10981             }
10982             ret = reg_node(pRExC_state, op);
10983             FLAGS(ret) = get_regex_charset(RExC_flags);
10984             *flagp |= SIMPLE;
10985             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10986                 ckWARNdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" or \"\\b[{]\" instead");
10987             }
10988             goto finish_meta_pat;
10989         case 'B':
10990             RExC_seen_zerolen++;
10991             RExC_seen |= REG_SEEN_LOOKBEHIND;
10992             op = NBOUND + get_regex_charset(RExC_flags);
10993             if (op > NBOUNDA) { /* /aa is same as /a */
10994                 op = NBOUNDA;
10995             }
10996             ret = reg_node(pRExC_state, op);
10997             FLAGS(ret) = get_regex_charset(RExC_flags);
10998             *flagp |= SIMPLE;
10999             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11000                 ckWARNdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" or \"\\B[{]\" instead");
11001             }
11002             goto finish_meta_pat;
11003
11004         case 'D':
11005             invert = 1;
11006             /* FALLTHROUGH */
11007         case 'd':
11008             arg = ANYOF_DIGIT;
11009             goto join_posix;
11010
11011         case 'R':
11012             ret = reg_node(pRExC_state, LNBREAK);
11013             *flagp |= HASWIDTH|SIMPLE;
11014             goto finish_meta_pat;
11015
11016         case 'H':
11017             invert = 1;
11018             /* FALLTHROUGH */
11019         case 'h':
11020             arg = ANYOF_BLANK;
11021             op = POSIXU;
11022             goto join_posix_op_known;
11023
11024         case 'V':
11025             invert = 1;
11026             /* FALLTHROUGH */
11027         case 'v':
11028             arg = ANYOF_VERTWS;
11029             op = POSIXU;
11030             goto join_posix_op_known;
11031
11032         case 'S':
11033             invert = 1;
11034             /* FALLTHROUGH */
11035         case 's':
11036             arg = ANYOF_SPACE;
11037
11038         join_posix:
11039
11040             op = POSIXD + get_regex_charset(RExC_flags);
11041             if (op > POSIXA) {  /* /aa is same as /a */
11042                 op = POSIXA;
11043             }
11044
11045         join_posix_op_known:
11046
11047             if (invert) {
11048                 op += NPOSIXD - POSIXD;
11049             }
11050
11051             ret = reg_node(pRExC_state, op);
11052             if (! SIZE_ONLY) {
11053                 FLAGS(ret) = namedclass_to_classnum(arg);
11054             }
11055
11056             *flagp |= HASWIDTH|SIMPLE;
11057             /* FALL THROUGH */
11058
11059          finish_meta_pat:
11060             nextchar(pRExC_state);
11061             Set_Node_Length(ret, 2); /* MJD */
11062             break;
11063         case 'p':
11064         case 'P':
11065             {
11066 #ifdef DEBUGGING
11067                 char* parse_start = RExC_parse - 2;
11068 #endif
11069
11070                 RExC_parse--;
11071
11072                 ret = regclass(pRExC_state, flagp,depth+1,
11073                                TRUE, /* means just parse this element */
11074                                FALSE, /* don't allow multi-char folds */
11075                                FALSE, /* don't silence non-portable warnings.
11076                                          It would be a bug if these returned
11077                                          non-portables */
11078                                NULL);
11079                 /* regclass() can only return RESTART_UTF8 if multi-char folds
11080                    are allowed.  */
11081                 if (!ret)
11082                     FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11083                           (UV) *flagp);
11084
11085                 RExC_parse--;
11086
11087                 Set_Node_Offset(ret, parse_start + 2);
11088                 Set_Node_Cur_Length(ret, parse_start);
11089                 nextchar(pRExC_state);
11090             }
11091             break;
11092         case 'N':
11093             /* Handle \N and \N{NAME} with multiple code points here and not
11094              * below because it can be multicharacter. join_exact() will join
11095              * them up later on.  Also this makes sure that things like
11096              * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
11097              * The options to the grok function call causes it to fail if the
11098              * sequence is just a single code point.  We then go treat it as
11099              * just another character in the current EXACT node, and hence it
11100              * gets uniform treatment with all the other characters.  The
11101              * special treatment for quantifiers is not needed for such single
11102              * character sequences */
11103             ++RExC_parse;
11104             if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
11105                                 FALSE /* not strict */ )) {
11106                 if (*flagp & RESTART_UTF8)
11107                     return NULL;
11108                 RExC_parse--;
11109                 goto defchar;
11110             }
11111             break;
11112         case 'k':    /* Handle \k<NAME> and \k'NAME' */
11113         parse_named_seq:
11114         {
11115             char ch= RExC_parse[1];
11116             if (ch != '<' && ch != '\'' && ch != '{') {
11117                 RExC_parse++;
11118                 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11119                 vFAIL2("Sequence %.2s... not terminated",parse_start);
11120             } else {
11121                 /* this pretty much dupes the code for (?P=...) in reg(), if
11122                    you change this make sure you change that */
11123                 char* name_start = (RExC_parse += 2);
11124                 U32 num = 0;
11125                 SV *sv_dat = reg_scan_name(pRExC_state,
11126                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11127                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
11128                 if (RExC_parse == name_start || *RExC_parse != ch)
11129                     /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11130                     vFAIL2("Sequence %.3s... not terminated",parse_start);
11131
11132                 if (!SIZE_ONLY) {
11133                     num = add_data( pRExC_state, STR_WITH_LEN("S"));
11134                     RExC_rxi->data->data[num]=(void*)sv_dat;
11135                     SvREFCNT_inc_simple_void(sv_dat);
11136                 }
11137
11138                 RExC_sawback = 1;
11139                 ret = reganode(pRExC_state,
11140                                ((! FOLD)
11141                                  ? NREF
11142                                  : (ASCII_FOLD_RESTRICTED)
11143                                    ? NREFFA
11144                                    : (AT_LEAST_UNI_SEMANTICS)
11145                                      ? NREFFU
11146                                      : (LOC)
11147                                        ? NREFFL
11148                                        : NREFF),
11149                                 num);
11150                 *flagp |= HASWIDTH;
11151
11152                 /* override incorrect value set in reganode MJD */
11153                 Set_Node_Offset(ret, parse_start+1);
11154                 Set_Node_Cur_Length(ret, parse_start);
11155                 nextchar(pRExC_state);
11156
11157             }
11158             break;
11159         }
11160         case 'g':
11161         case '1': case '2': case '3': case '4':
11162         case '5': case '6': case '7': case '8': case '9':
11163             {
11164                 I32 num;
11165                 bool hasbrace = 0;
11166
11167                 if (*RExC_parse == 'g') {
11168                     bool isrel = 0;
11169
11170                     RExC_parse++;
11171                     if (*RExC_parse == '{') {
11172                         RExC_parse++;
11173                         hasbrace = 1;
11174                     }
11175                     if (*RExC_parse == '-') {
11176                         RExC_parse++;
11177                         isrel = 1;
11178                     }
11179                     if (hasbrace && !isDIGIT(*RExC_parse)) {
11180                         if (isrel) RExC_parse--;
11181                         RExC_parse -= 2;
11182                         goto parse_named_seq;
11183                     }
11184
11185                     num = S_backref_value(RExC_parse);
11186                     if (num == 0)
11187                         vFAIL("Reference to invalid group 0");
11188                     else if (num == I32_MAX) {
11189                          if (isDIGIT(*RExC_parse))
11190                             vFAIL("Reference to nonexistent group");
11191                         else
11192                             vFAIL("Unterminated \\g... pattern");
11193                     }
11194
11195                     if (isrel) {
11196                         num = RExC_npar - num;
11197                         if (num < 1)
11198                             vFAIL("Reference to nonexistent or unclosed group");
11199                     }
11200                 }
11201                 else {
11202                     num = S_backref_value(RExC_parse);
11203                     /* bare \NNN might be backref or octal */
11204                     if (num == I32_MAX || (num > 9 && num >= RExC_npar
11205                             && *RExC_parse != '8' && *RExC_parse != '9'))
11206                         /* Probably a character specified in octal, e.g. \35 */
11207                         goto defchar;
11208                 }
11209
11210                 /* at this point RExC_parse definitely points to a backref
11211                  * number */
11212                 {
11213 #ifdef RE_TRACK_PATTERN_OFFSETS
11214                     char * const parse_start = RExC_parse - 1; /* MJD */
11215 #endif
11216                     while (isDIGIT(*RExC_parse))
11217                         RExC_parse++;
11218                     if (hasbrace) {
11219                         if (*RExC_parse != '}')
11220                             vFAIL("Unterminated \\g{...} pattern");
11221                         RExC_parse++;
11222                     }
11223                     if (!SIZE_ONLY) {
11224                         if (num > (I32)RExC_rx->nparens)
11225                             vFAIL("Reference to nonexistent group");
11226                     }
11227                     RExC_sawback = 1;
11228                     ret = reganode(pRExC_state,
11229                                    ((! FOLD)
11230                                      ? REF
11231                                      : (ASCII_FOLD_RESTRICTED)
11232                                        ? REFFA
11233                                        : (AT_LEAST_UNI_SEMANTICS)
11234                                          ? REFFU
11235                                          : (LOC)
11236                                            ? REFFL
11237                                            : REFF),
11238                                     num);
11239                     *flagp |= HASWIDTH;
11240
11241                     /* override incorrect value set in reganode MJD */
11242                     Set_Node_Offset(ret, parse_start+1);
11243                     Set_Node_Cur_Length(ret, parse_start);
11244                     RExC_parse--;
11245                     nextchar(pRExC_state);
11246                 }
11247             }
11248             break;
11249         case '\0':
11250             if (RExC_parse >= RExC_end)
11251                 FAIL("Trailing \\");
11252             /* FALL THROUGH */
11253         default:
11254             /* Do not generate "unrecognized" warnings here, we fall
11255                back into the quick-grab loop below */
11256             parse_start--;
11257             goto defchar;
11258         }
11259         break;
11260
11261     case '#':
11262         if (RExC_flags & RXf_PMf_EXTENDED) {
11263             if ( reg_skipcomment( pRExC_state ) )
11264                 goto tryagain;
11265         }
11266         /* FALL THROUGH */
11267
11268     default:
11269
11270             parse_start = RExC_parse - 1;
11271
11272             RExC_parse++;
11273
11274         defchar: {
11275             STRLEN len = 0;
11276             UV ender = 0;
11277             char *p;
11278             char *s;
11279 #define MAX_NODE_STRING_SIZE 127
11280             char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
11281             char *s0;
11282             U8 upper_parse = MAX_NODE_STRING_SIZE;
11283             STRLEN foldlen;
11284             U8 node_type = compute_EXACTish(pRExC_state);
11285             bool next_is_quantifier;
11286             char * oldp = NULL;
11287
11288             /* We can convert EXACTF nodes to EXACTFU if they contain only
11289              * characters that match identically regardless of the target
11290              * string's UTF8ness.  The reason to do this is that EXACTF is not
11291              * trie-able, EXACTFU is.  (We don't need to figure this out until
11292              * pass 2) */
11293             bool maybe_exactfu = node_type == EXACTF && PASS2;
11294
11295             /* If a folding node contains only code points that don't
11296              * participate in folds, it can be changed into an EXACT node,
11297              * which allows the optimizer more things to look for */
11298             bool maybe_exact;
11299
11300             ret = reg_node(pRExC_state, node_type);
11301
11302             /* In pass1, folded, we use a temporary buffer instead of the
11303              * actual node, as the node doesn't exist yet */
11304             s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
11305
11306             s0 = s;
11307
11308         reparse:
11309
11310             /* We do the EXACTFish to EXACT node only if folding, and not if in
11311              * locale, as whether a character folds or not isn't known until
11312              * runtime.  (And we don't need to figure this out until pass 2) */
11313             maybe_exact = FOLD && ! LOC && PASS2;
11314
11315             /* XXX The node can hold up to 255 bytes, yet this only goes to
11316              * 127.  I (khw) do not know why.  Keeping it somewhat less than
11317              * 255 allows us to not have to worry about overflow due to
11318              * converting to utf8 and fold expansion, but that value is
11319              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
11320              * split up by this limit into a single one using the real max of
11321              * 255.  Even at 127, this breaks under rare circumstances.  If
11322              * folding, we do not want to split a node at a character that is a
11323              * non-final in a multi-char fold, as an input string could just
11324              * happen to want to match across the node boundary.  The join
11325              * would solve that problem if the join actually happens.  But a
11326              * series of more than two nodes in a row each of 127 would cause
11327              * the first join to succeed to get to 254, but then there wouldn't
11328              * be room for the next one, which could at be one of those split
11329              * multi-char folds.  I don't know of any fool-proof solution.  One
11330              * could back off to end with only a code point that isn't such a
11331              * non-final, but it is possible for there not to be any in the
11332              * entire node. */
11333             for (p = RExC_parse - 1;
11334                  len < upper_parse && p < RExC_end;
11335                  len++)
11336             {
11337                 oldp = p;
11338
11339                 if (RExC_flags & RXf_PMf_EXTENDED)
11340                     p = regwhite( pRExC_state, p );
11341                 switch ((U8)*p) {
11342                 case '^':
11343                 case '$':
11344                 case '.':
11345                 case '[':
11346                 case '(':
11347                 case ')':
11348                 case '|':
11349                     goto loopdone;
11350                 case '\\':
11351                     /* Literal Escapes Switch
11352
11353                        This switch is meant to handle escape sequences that
11354                        resolve to a literal character.
11355
11356                        Every escape sequence that represents something
11357                        else, like an assertion or a char class, is handled
11358                        in the switch marked 'Special Escapes' above in this
11359                        routine, but also has an entry here as anything that
11360                        isn't explicitly mentioned here will be treated as
11361                        an unescaped equivalent literal.
11362                     */
11363
11364                     switch ((U8)*++p) {
11365                     /* These are all the special escapes. */
11366                     case 'A':             /* Start assertion */
11367                     case 'b': case 'B':   /* Word-boundary assertion*/
11368                     case 'C':             /* Single char !DANGEROUS! */
11369                     case 'd': case 'D':   /* digit class */
11370                     case 'g': case 'G':   /* generic-backref, pos assertion */
11371                     case 'h': case 'H':   /* HORIZWS */
11372                     case 'k': case 'K':   /* named backref, keep marker */
11373                     case 'p': case 'P':   /* Unicode property */
11374                               case 'R':   /* LNBREAK */
11375                     case 's': case 'S':   /* space class */
11376                     case 'v': case 'V':   /* VERTWS */
11377                     case 'w': case 'W':   /* word class */
11378                     case 'X':             /* eXtended Unicode "combining character sequence" */
11379                     case 'z': case 'Z':   /* End of line/string assertion */
11380                         --p;
11381                         goto loopdone;
11382
11383                     /* Anything after here is an escape that resolves to a
11384                        literal. (Except digits, which may or may not)
11385                      */
11386                     case 'n':
11387                         ender = '\n';
11388                         p++;
11389                         break;
11390                     case 'N': /* Handle a single-code point named character. */
11391                         /* The options cause it to fail if a multiple code
11392                          * point sequence.  Handle those in the switch() above
11393                          * */
11394                         RExC_parse = p + 1;
11395                         if (! grok_bslash_N(pRExC_state, NULL, &ender,
11396                                             flagp, depth, FALSE,
11397                                             FALSE /* not strict */ ))
11398                         {
11399                             if (*flagp & RESTART_UTF8)
11400                                 FAIL("panic: grok_bslash_N set RESTART_UTF8");
11401                             RExC_parse = p = oldp;
11402                             goto loopdone;
11403                         }
11404                         p = RExC_parse;
11405                         if (ender > 0xff) {
11406                             REQUIRE_UTF8;
11407                         }
11408                         break;
11409                     case 'r':
11410                         ender = '\r';
11411                         p++;
11412                         break;
11413                     case 't':
11414                         ender = '\t';
11415                         p++;
11416                         break;
11417                     case 'f':
11418                         ender = '\f';
11419                         p++;
11420                         break;
11421                     case 'e':
11422                           ender = ASCII_TO_NATIVE('\033');
11423                         p++;
11424                         break;
11425                     case 'a':
11426                           ender = '\a';
11427                         p++;
11428                         break;
11429                     case 'o':
11430                         {
11431                             UV result;
11432                             const char* error_msg;
11433
11434                             bool valid = grok_bslash_o(&p,
11435                                                        &result,
11436                                                        &error_msg,
11437                                                        TRUE, /* out warnings */
11438                                                        FALSE, /* not strict */
11439                                                        TRUE, /* Output warnings
11440                                                                 for non-
11441                                                                 portables */
11442                                                        UTF);
11443                             if (! valid) {
11444                                 RExC_parse = p; /* going to die anyway; point
11445                                                    to exact spot of failure */
11446                                 vFAIL(error_msg);
11447                             }
11448                             ender = result;
11449                             if (PL_encoding && ender < 0x100) {
11450                                 goto recode_encoding;
11451                             }
11452                             if (ender > 0xff) {
11453                                 REQUIRE_UTF8;
11454                             }
11455                             break;
11456                         }
11457                     case 'x':
11458                         {
11459                             UV result = UV_MAX; /* initialize to erroneous
11460                                                    value */
11461                             const char* error_msg;
11462
11463                             bool valid = grok_bslash_x(&p,
11464                                                        &result,
11465                                                        &error_msg,
11466                                                        TRUE, /* out warnings */
11467                                                        FALSE, /* not strict */
11468                                                        TRUE, /* Output warnings
11469                                                                 for non-
11470                                                                 portables */
11471                                                        UTF);
11472                             if (! valid) {
11473                                 RExC_parse = p; /* going to die anyway; point
11474                                                    to exact spot of failure */
11475                                 vFAIL(error_msg);
11476                             }
11477                             ender = result;
11478
11479                             if (PL_encoding && ender < 0x100) {
11480                                 goto recode_encoding;
11481                             }
11482                             if (ender > 0xff) {
11483                                 REQUIRE_UTF8;
11484                             }
11485                             break;
11486                         }
11487                     case 'c':
11488                         p++;
11489                         ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
11490                         break;
11491                     case '8': case '9': /* must be a backreference */
11492                         --p;
11493                         goto loopdone;
11494                     case '1': case '2': case '3':case '4':
11495                     case '5': case '6': case '7':
11496                         /* When we parse backslash escapes there is ambiguity
11497                          * between backreferences and octal escapes. Any escape
11498                          * from \1 - \9 is a backreference, any multi-digit
11499                          * escape which does not start with 0 and which when
11500                          * evaluated as decimal could refer to an already
11501                          * parsed capture buffer is a backslash. Anything else
11502                          * is octal.
11503                          *
11504                          * Note this implies that \118 could be interpreted as
11505                          * 118 OR as "\11" . "8" depending on whether there
11506                          * were 118 capture buffers defined already in the
11507                          * pattern.  */
11508                         if ( !isDIGIT(p[1]) || S_backref_value(p) <= RExC_npar)
11509                         {  /* Not to be treated as an octal constant, go
11510                                    find backref */
11511                             --p;
11512                             goto loopdone;
11513                         }
11514                     case '0':
11515                         {
11516                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11517                             STRLEN numlen = 3;
11518                             ender = grok_oct(p, &numlen, &flags, NULL);
11519                             if (ender > 0xff) {
11520                                 REQUIRE_UTF8;
11521                             }
11522                             p += numlen;
11523                             if (SIZE_ONLY   /* like \08, \178 */
11524                                 && numlen < 3
11525                                 && p < RExC_end
11526                                 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
11527                             {
11528                                 reg_warn_non_literal_string(
11529                                          p + 1,
11530                                          form_short_octal_warning(p, numlen));
11531                             }
11532                         }
11533                         if (PL_encoding && ender < 0x100)
11534                             goto recode_encoding;
11535                         break;
11536                     recode_encoding:
11537                         if (! RExC_override_recoding) {
11538                             SV* enc = PL_encoding;
11539                             ender = reg_recode((const char)(U8)ender, &enc);
11540                             if (!enc && SIZE_ONLY)
11541                                 ckWARNreg(p, "Invalid escape in the specified encoding");
11542                             REQUIRE_UTF8;
11543                         }
11544                         break;
11545                     case '\0':
11546                         if (p >= RExC_end)
11547                             FAIL("Trailing \\");
11548                         /* FALL THROUGH */
11549                     default:
11550                         if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
11551                             /* Include any { following the alpha to emphasize
11552                              * that it could be part of an escape at some point
11553                              * in the future */
11554                             int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
11555                             ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
11556                         }
11557                         goto normal_default;
11558                     } /* End of switch on '\' */
11559                     break;
11560                 default:    /* A literal character */
11561
11562                     if (! SIZE_ONLY
11563                         && RExC_flags & RXf_PMf_EXTENDED
11564                         && ckWARN_d(WARN_DEPRECATED)
11565                         && is_PATWS_non_low(p, UTF))
11566                     {
11567                         vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
11568                                 "Escape literal pattern white space under /x");
11569                     }
11570
11571                   normal_default:
11572                     if (UTF8_IS_START(*p) && UTF) {
11573                         STRLEN numlen;
11574                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
11575                                                &numlen, UTF8_ALLOW_DEFAULT);
11576                         p += numlen;
11577                     }
11578                     else
11579                         ender = (U8) *p++;
11580                     break;
11581                 } /* End of switch on the literal */
11582
11583                 /* Here, have looked at the literal character and <ender>
11584                  * contains its ordinal, <p> points to the character after it
11585                  */
11586
11587                 if ( RExC_flags & RXf_PMf_EXTENDED)
11588                     p = regwhite( pRExC_state, p );
11589
11590                 /* If the next thing is a quantifier, it applies to this
11591                  * character only, which means that this character has to be in
11592                  * its own node and can't just be appended to the string in an
11593                  * existing node, so if there are already other characters in
11594                  * the node, close the node with just them, and set up to do
11595                  * this character again next time through, when it will be the
11596                  * only thing in its new node */
11597                 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
11598                 {
11599                     p = oldp;
11600                     goto loopdone;
11601                 }
11602
11603                 if (! FOLD) {
11604                     if (UTF) {
11605                         const STRLEN unilen = reguni(pRExC_state, ender, s);
11606                         if (unilen > 0) {
11607                            s   += unilen;
11608                            len += unilen;
11609                         }
11610
11611                         /* The loop increments <len> each time, as all but this
11612                          * path (and one other) through it add a single byte to
11613                          * the EXACTish node.  But this one has changed len to
11614                          * be the correct final value, so subtract one to
11615                          * cancel out the increment that follows */
11616                         len--;
11617                     }
11618                     else {
11619                         REGC((char)ender, s++);
11620                     }
11621                 }
11622                 else /* FOLD */ if (! ( UTF
11623                         /* See comments for join_exact() as to why we fold this
11624                          * non-UTF at compile time */
11625                         || (node_type == EXACTFU
11626                             && ender == LATIN_SMALL_LETTER_SHARP_S)))
11627                 {
11628                     if (IS_IN_SOME_FOLD_L1(ender)) {
11629                         maybe_exact = FALSE;
11630
11631                         /* See if the character's fold differs between /d and
11632                          * /u.  This includes the multi-char fold SHARP S to
11633                          * 'ss' */
11634                         if (maybe_exactfu
11635                             && (PL_fold[ender] != PL_fold_latin1[ender]
11636                                 || ender == LATIN_SMALL_LETTER_SHARP_S
11637                                 || (len > 0
11638                                    && isARG2_lower_or_UPPER_ARG1('s', ender)
11639                                    && isARG2_lower_or_UPPER_ARG1('s', *(s-1)))))
11640                         {
11641                             maybe_exactfu = FALSE;
11642                         }
11643                     }
11644                     *(s++) = (char) ender;
11645                 }
11646                 else {  /* UTF */
11647
11648                     /* Prime the casefolded buffer.  Locale rules, which apply
11649                      * only to code points < 256, aren't known until execution,
11650                      * so for them, just output the original character using
11651                      * utf8.  If we start to fold non-UTF patterns, be sure to
11652                      * update join_exact() */
11653                     if (LOC && ender < 256) {
11654                         if (UVCHR_IS_INVARIANT(ender)) {
11655                             *s = (U8) ender;
11656                             foldlen = 1;
11657                         } else {
11658                             *s = UTF8_TWO_BYTE_HI(ender);
11659                             *(s + 1) = UTF8_TWO_BYTE_LO(ender);
11660                             foldlen = 2;
11661                         }
11662                     }
11663                     else {
11664                         UV folded = _to_uni_fold_flags(
11665                                        ender,
11666                                        (U8 *) s,
11667                                        &foldlen,
11668                                        FOLD_FLAGS_FULL
11669                                        | ((LOC) ?  FOLD_FLAGS_LOCALE
11670                                                 : (ASCII_FOLD_RESTRICTED)
11671                                                   ? FOLD_FLAGS_NOMIX_ASCII
11672                                                   : 0)
11673                                         );
11674
11675                         /* If this node only contains non-folding code points
11676                          * so far, see if this new one is also non-folding */
11677                         if (maybe_exact) {
11678                             if (folded != ender) {
11679                                 maybe_exact = FALSE;
11680                             }
11681                             else {
11682                                 /* Here the fold is the original; we have
11683                                  * to check further to see if anything
11684                                  * folds to it */
11685                                 if (! PL_utf8_foldable) {
11686                                     SV* swash = swash_init("utf8",
11687                                                        "_Perl_Any_Folds",
11688                                                        &PL_sv_undef, 1, 0);
11689                                     PL_utf8_foldable =
11690                                                 _get_swash_invlist(swash);
11691                                     SvREFCNT_dec_NN(swash);
11692                                 }
11693                                 if (_invlist_contains_cp(PL_utf8_foldable,
11694                                                          ender))
11695                                 {
11696                                     maybe_exact = FALSE;
11697                                 }
11698                             }
11699                         }
11700                         ender = folded;
11701                     }
11702                     s += foldlen;
11703
11704                     /* The loop increments <len> each time, as all but this
11705                      * path (and one other) through it add a single byte to the
11706                      * EXACTish node.  But this one has changed len to be the
11707                      * correct final value, so subtract one to cancel out the
11708                      * increment that follows */
11709                     len += foldlen - 1;
11710                 }
11711
11712                 if (next_is_quantifier) {
11713
11714                     /* Here, the next input is a quantifier, and to get here,
11715                      * the current character is the only one in the node.
11716                      * Also, here <len> doesn't include the final byte for this
11717                      * character */
11718                     len++;
11719                     goto loopdone;
11720                 }
11721
11722             } /* End of loop through literal characters */
11723
11724             /* Here we have either exhausted the input or ran out of room in
11725              * the node.  (If we encountered a character that can't be in the
11726              * node, transfer is made directly to <loopdone>, and so we
11727              * wouldn't have fallen off the end of the loop.)  In the latter
11728              * case, we artificially have to split the node into two, because
11729              * we just don't have enough space to hold everything.  This
11730              * creates a problem if the final character participates in a
11731              * multi-character fold in the non-final position, as a match that
11732              * should have occurred won't, due to the way nodes are matched,
11733              * and our artificial boundary.  So back off until we find a non-
11734              * problematic character -- one that isn't at the beginning or
11735              * middle of such a fold.  (Either it doesn't participate in any
11736              * folds, or appears only in the final position of all the folds it
11737              * does participate in.)  A better solution with far fewer false
11738              * positives, and that would fill the nodes more completely, would
11739              * be to actually have available all the multi-character folds to
11740              * test against, and to back-off only far enough to be sure that
11741              * this node isn't ending with a partial one.  <upper_parse> is set
11742              * further below (if we need to reparse the node) to include just
11743              * up through that final non-problematic character that this code
11744              * identifies, so when it is set to less than the full node, we can
11745              * skip the rest of this */
11746             if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
11747
11748                 const STRLEN full_len = len;
11749
11750                 assert(len >= MAX_NODE_STRING_SIZE);
11751
11752                 /* Here, <s> points to the final byte of the final character.
11753                  * Look backwards through the string until find a non-
11754                  * problematic character */
11755
11756                 if (! UTF) {
11757
11758                     /* These two have no multi-char folds to non-UTF characters
11759                      */
11760                     if (ASCII_FOLD_RESTRICTED || LOC) {
11761                         goto loopdone;
11762                     }
11763
11764                     while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
11765                     len = s - s0 + 1;
11766                 }
11767                 else {
11768                     if (!  PL_NonL1NonFinalFold) {
11769                         PL_NonL1NonFinalFold = _new_invlist_C_array(
11770                                         NonL1_Perl_Non_Final_Folds_invlist);
11771                     }
11772
11773                     /* Point to the first byte of the final character */
11774                     s = (char *) utf8_hop((U8 *) s, -1);
11775
11776                     while (s >= s0) {   /* Search backwards until find
11777                                            non-problematic char */
11778                         if (UTF8_IS_INVARIANT(*s)) {
11779
11780                             /* There are no ascii characters that participate
11781                              * in multi-char folds under /aa.  In EBCDIC, the
11782                              * non-ascii invariants are all control characters,
11783                              * so don't ever participate in any folds. */
11784                             if (ASCII_FOLD_RESTRICTED
11785                                 || ! IS_NON_FINAL_FOLD(*s))
11786                             {
11787                                 break;
11788                             }
11789                         }
11790                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
11791
11792                             /* No Latin1 characters participate in multi-char
11793                              * folds under /l */
11794                             if (LOC
11795                                 || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
11796                                                                   *s, *(s+1))))
11797                             {
11798                                 break;
11799                             }
11800                         }
11801                         else if (! _invlist_contains_cp(
11802                                         PL_NonL1NonFinalFold,
11803                                         valid_utf8_to_uvchr((U8 *) s, NULL)))
11804                         {
11805                             break;
11806                         }
11807
11808                         /* Here, the current character is problematic in that
11809                          * it does occur in the non-final position of some
11810                          * fold, so try the character before it, but have to
11811                          * special case the very first byte in the string, so
11812                          * we don't read outside the string */
11813                         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
11814                     } /* End of loop backwards through the string */
11815
11816                     /* If there were only problematic characters in the string,
11817                      * <s> will point to before s0, in which case the length
11818                      * should be 0, otherwise include the length of the
11819                      * non-problematic character just found */
11820                     len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
11821                 }
11822
11823                 /* Here, have found the final character, if any, that is
11824                  * non-problematic as far as ending the node without splitting
11825                  * it across a potential multi-char fold.  <len> contains the
11826                  * number of bytes in the node up-to and including that
11827                  * character, or is 0 if there is no such character, meaning
11828                  * the whole node contains only problematic characters.  In
11829                  * this case, give up and just take the node as-is.  We can't
11830                  * do any better */
11831                 if (len == 0) {
11832                     len = full_len;
11833
11834                     /* If the node ends in an 's' we make sure it stays EXACTF,
11835                      * as if it turns into an EXACTFU, it could later get
11836                      * joined with another 's' that would then wrongly match
11837                      * the sharp s */
11838                     if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
11839                     {
11840                         maybe_exactfu = FALSE;
11841                     }
11842                 } else {
11843
11844                     /* Here, the node does contain some characters that aren't
11845                      * problematic.  If one such is the final character in the
11846                      * node, we are done */
11847                     if (len == full_len) {
11848                         goto loopdone;
11849                     }
11850                     else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
11851
11852                         /* If the final character is problematic, but the
11853                          * penultimate is not, back-off that last character to
11854                          * later start a new node with it */
11855                         p = oldp;
11856                         goto loopdone;
11857                     }
11858
11859                     /* Here, the final non-problematic character is earlier
11860                      * in the input than the penultimate character.  What we do
11861                      * is reparse from the beginning, going up only as far as
11862                      * this final ok one, thus guaranteeing that the node ends
11863                      * in an acceptable character.  The reason we reparse is
11864                      * that we know how far in the character is, but we don't
11865                      * know how to correlate its position with the input parse.
11866                      * An alternate implementation would be to build that
11867                      * correlation as we go along during the original parse,
11868                      * but that would entail extra work for every node, whereas
11869                      * this code gets executed only when the string is too
11870                      * large for the node, and the final two characters are
11871                      * problematic, an infrequent occurrence.  Yet another
11872                      * possible strategy would be to save the tail of the
11873                      * string, and the next time regatom is called, initialize
11874                      * with that.  The problem with this is that unless you
11875                      * back off one more character, you won't be guaranteed
11876                      * regatom will get called again, unless regbranch,
11877                      * regpiece ... are also changed.  If you do back off that
11878                      * extra character, so that there is input guaranteed to
11879                      * force calling regatom, you can't handle the case where
11880                      * just the first character in the node is acceptable.  I
11881                      * (khw) decided to try this method which doesn't have that
11882                      * pitfall; if performance issues are found, we can do a
11883                      * combination of the current approach plus that one */
11884                     upper_parse = len;
11885                     len = 0;
11886                     s = s0;
11887                     goto reparse;
11888                 }
11889             }   /* End of verifying node ends with an appropriate char */
11890
11891         loopdone:   /* Jumped to when encounters something that shouldn't be in
11892                        the node */
11893
11894             /* I (khw) don't know if you can get here with zero length, but the
11895              * old code handled this situation by creating a zero-length EXACT
11896              * node.  Might as well be NOTHING instead */
11897             if (len == 0) {
11898                 OP(ret) = NOTHING;
11899             }
11900             else {
11901                 if (FOLD) {
11902                     /* If 'maybe_exact' is still set here, means there are no
11903                      * code points in the node that participate in folds;
11904                      * similarly for 'maybe_exactfu' and code points that match
11905                      * differently depending on UTF8ness of the target string
11906                      * */
11907                     if (maybe_exact) {
11908                         OP(ret) = EXACT;
11909                     }
11910                     else if (maybe_exactfu) {
11911                         OP(ret) = EXACTFU;
11912                     }
11913                 }
11914                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
11915             }
11916
11917             RExC_parse = p - 1;
11918             Set_Node_Cur_Length(ret, parse_start);
11919             nextchar(pRExC_state);
11920             {
11921                 /* len is STRLEN which is unsigned, need to copy to signed */
11922                 IV iv = len;
11923                 if (iv < 0)
11924                     vFAIL("Internal disaster");
11925             }
11926
11927         } /* End of label 'defchar:' */
11928         break;
11929     } /* End of giant switch on input character */
11930
11931     return(ret);
11932 }
11933
11934 STATIC char *
11935 S_regwhite( RExC_state_t *pRExC_state, char *p )
11936 {
11937     const char *e = RExC_end;
11938
11939     PERL_ARGS_ASSERT_REGWHITE;
11940
11941     while (p < e) {
11942         if (isSPACE(*p))
11943             ++p;
11944         else if (*p == '#') {
11945             bool ended = 0;
11946             do {
11947                 if (*p++ == '\n') {
11948                     ended = 1;
11949                     break;
11950                 }
11951             } while (p < e);
11952             if (!ended)
11953                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11954         }
11955         else
11956             break;
11957     }
11958     return p;
11959 }
11960
11961 STATIC char *
11962 S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
11963 {
11964     /* Returns the next non-pattern-white space, non-comment character (the
11965      * latter only if 'recognize_comment is true) in the string p, which is
11966      * ended by RExC_end.  If there is no line break ending a comment,
11967      * RExC_seen has added the REG_SEEN_RUN_ON_COMMENT flag; */
11968     const char *e = RExC_end;
11969
11970     PERL_ARGS_ASSERT_REGPATWS;
11971
11972     while (p < e) {
11973         STRLEN len;
11974         if ((len = is_PATWS_safe(p, e, UTF))) {
11975             p += len;
11976         }
11977         else if (recognize_comment && *p == '#') {
11978             bool ended = 0;
11979             do {
11980                 p++;
11981                 if (is_LNBREAK_safe(p, e, UTF)) {
11982                     ended = 1;
11983                     break;
11984                 }
11985             } while (p < e);
11986             if (!ended)
11987                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11988         }
11989         else
11990             break;
11991     }
11992     return p;
11993 }
11994
11995 STATIC void
11996 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
11997 {
11998     /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
11999      * sets up the bitmap and any flags, removing those code points from the
12000      * inversion list, setting it to NULL should it become completely empty */
12001
12002     PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
12003     assert(PL_regkind[OP(node)] == ANYOF);
12004
12005     ANYOF_BITMAP_ZERO(node);
12006     if (*invlist_ptr) {
12007
12008         /* This gets set if we actually need to modify things */
12009         bool change_invlist = FALSE;
12010
12011         UV start, end;
12012
12013         /* Start looking through *invlist_ptr */
12014         invlist_iterinit(*invlist_ptr);
12015         while (invlist_iternext(*invlist_ptr, &start, &end)) {
12016             UV high;
12017             int i;
12018
12019             if (end == UV_MAX && start <= 256) {
12020                 ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL;
12021             }
12022
12023             /* Quit if are above what we should change */
12024             if (start > 255) {
12025                 break;
12026             }
12027
12028             change_invlist = TRUE;
12029
12030             /* Set all the bits in the range, up to the max that we are doing */
12031             high = (end < 255) ? end : 255;
12032             for (i = start; i <= (int) high; i++) {
12033                 if (! ANYOF_BITMAP_TEST(node, i)) {
12034                     ANYOF_BITMAP_SET(node, i);
12035                 }
12036             }
12037         }
12038         invlist_iterfinish(*invlist_ptr);
12039
12040         /* Done with loop; remove any code points that are in the bitmap from
12041          * *invlist_ptr; similarly for code points above latin1 if we have a flag
12042          * to match all of them anyways */
12043         if (change_invlist) {
12044             _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr);
12045         }
12046         if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
12047             _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr);
12048         }
12049
12050         /* If have completely emptied it, remove it completely */
12051         if (_invlist_len(*invlist_ptr) == 0) {
12052             SvREFCNT_dec_NN(*invlist_ptr);
12053             *invlist_ptr = NULL;
12054         }
12055     }
12056 }
12057
12058 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
12059    Character classes ([:foo:]) can also be negated ([:^foo:]).
12060    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
12061    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
12062    but trigger failures because they are currently unimplemented. */
12063
12064 #define POSIXCC_DONE(c)   ((c) == ':')
12065 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
12066 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
12067
12068 PERL_STATIC_INLINE I32
12069 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
12070 {
12071     dVAR;
12072     I32 namedclass = OOB_NAMEDCLASS;
12073
12074     PERL_ARGS_ASSERT_REGPPOSIXCC;
12075
12076     if (value == '[' && RExC_parse + 1 < RExC_end &&
12077         /* I smell either [: or [= or [. -- POSIX has been here, right? */
12078         POSIXCC(UCHARAT(RExC_parse)))
12079     {
12080         const char c = UCHARAT(RExC_parse);
12081         char* const s = RExC_parse++;
12082
12083         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
12084             RExC_parse++;
12085         if (RExC_parse == RExC_end) {
12086             if (strict) {
12087
12088                 /* Try to give a better location for the error (than the end of
12089                  * the string) by looking for the matching ']' */
12090                 RExC_parse = s;
12091                 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
12092                     RExC_parse++;
12093                 }
12094                 vFAIL2("Unmatched '%c' in POSIX class", c);
12095             }
12096             /* Grandfather lone [:, [=, [. */
12097             RExC_parse = s;
12098         }
12099         else {
12100             const char* const t = RExC_parse++; /* skip over the c */
12101             assert(*t == c);
12102
12103             if (UCHARAT(RExC_parse) == ']') {
12104                 const char *posixcc = s + 1;
12105                 RExC_parse++; /* skip over the ending ] */
12106
12107                 if (*s == ':') {
12108                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
12109                     const I32 skip = t - posixcc;
12110
12111                     /* Initially switch on the length of the name.  */
12112                     switch (skip) {
12113                     case 4:
12114                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
12115                                                           this is the Perl \w
12116                                                         */
12117                             namedclass = ANYOF_WORDCHAR;
12118                         break;
12119                     case 5:
12120                         /* Names all of length 5.  */
12121                         /* alnum alpha ascii blank cntrl digit graph lower
12122                            print punct space upper  */
12123                         /* Offset 4 gives the best switch position.  */
12124                         switch (posixcc[4]) {
12125                         case 'a':
12126                             if (memEQ(posixcc, "alph", 4)) /* alpha */
12127                                 namedclass = ANYOF_ALPHA;
12128                             break;
12129                         case 'e':
12130                             if (memEQ(posixcc, "spac", 4)) /* space */
12131                                 namedclass = ANYOF_PSXSPC;
12132                             break;
12133                         case 'h':
12134                             if (memEQ(posixcc, "grap", 4)) /* graph */
12135                                 namedclass = ANYOF_GRAPH;
12136                             break;
12137                         case 'i':
12138                             if (memEQ(posixcc, "asci", 4)) /* ascii */
12139                                 namedclass = ANYOF_ASCII;
12140                             break;
12141                         case 'k':
12142                             if (memEQ(posixcc, "blan", 4)) /* blank */
12143                                 namedclass = ANYOF_BLANK;
12144                             break;
12145                         case 'l':
12146                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
12147                                 namedclass = ANYOF_CNTRL;
12148                             break;
12149                         case 'm':
12150                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
12151                                 namedclass = ANYOF_ALPHANUMERIC;
12152                             break;
12153                         case 'r':
12154                             if (memEQ(posixcc, "lowe", 4)) /* lower */
12155                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
12156                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
12157                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
12158                             break;
12159                         case 't':
12160                             if (memEQ(posixcc, "digi", 4)) /* digit */
12161                                 namedclass = ANYOF_DIGIT;
12162                             else if (memEQ(posixcc, "prin", 4)) /* print */
12163                                 namedclass = ANYOF_PRINT;
12164                             else if (memEQ(posixcc, "punc", 4)) /* punct */
12165                                 namedclass = ANYOF_PUNCT;
12166                             break;
12167                         }
12168                         break;
12169                     case 6:
12170                         if (memEQ(posixcc, "xdigit", 6))
12171                             namedclass = ANYOF_XDIGIT;
12172                         break;
12173                     }
12174
12175                     if (namedclass == OOB_NAMEDCLASS)
12176                         vFAIL2utf8f(
12177                             "POSIX class [:%"UTF8f":] unknown",
12178                             UTF8fARG(UTF, t - s - 1, s + 1));
12179
12180                     /* The #defines are structured so each complement is +1 to
12181                      * the normal one */
12182                     if (complement) {
12183                         namedclass++;
12184                     }
12185                     assert (posixcc[skip] == ':');
12186                     assert (posixcc[skip+1] == ']');
12187                 } else if (!SIZE_ONLY) {
12188                     /* [[=foo=]] and [[.foo.]] are still future. */
12189
12190                     /* adjust RExC_parse so the warning shows after
12191                        the class closes */
12192                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
12193                         RExC_parse++;
12194                     vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
12195                 }
12196             } else {
12197                 /* Maternal grandfather:
12198                  * "[:" ending in ":" but not in ":]" */
12199                 if (strict) {
12200                     vFAIL("Unmatched '[' in POSIX class");
12201                 }
12202
12203                 /* Grandfather lone [:, [=, [. */
12204                 RExC_parse = s;
12205             }
12206         }
12207     }
12208
12209     return namedclass;
12210 }
12211
12212 STATIC bool
12213 S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state)
12214 {
12215     /* This applies some heuristics at the current parse position (which should
12216      * be at a '[') to see if what follows might be intended to be a [:posix:]
12217      * class.  It returns true if it really is a posix class, of course, but it
12218      * also can return true if it thinks that what was intended was a posix
12219      * class that didn't quite make it.
12220      *
12221      * It will return true for
12222      *      [:alphanumerics:
12223      *      [:alphanumerics]  (as long as the ] isn't followed immediately by a
12224      *                         ')' indicating the end of the (?[
12225      *      [:any garbage including %^&$ punctuation:]
12226      *
12227      * This is designed to be called only from S_handle_regex_sets; it could be
12228      * easily adapted to be called from the spot at the beginning of regclass()
12229      * that checks to see in a normal bracketed class if the surrounding []
12230      * have been omitted ([:word:] instead of [[:word:]]).  But doing so would
12231      * change long-standing behavior, so I (khw) didn't do that */
12232     char* p = RExC_parse + 1;
12233     char first_char = *p;
12234
12235     PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
12236
12237     assert(*(p - 1) == '[');
12238
12239     if (! POSIXCC(first_char)) {
12240         return FALSE;
12241     }
12242
12243     p++;
12244     while (p < RExC_end && isWORDCHAR(*p)) p++;
12245
12246     if (p >= RExC_end) {
12247         return FALSE;
12248     }
12249
12250     if (p - RExC_parse > 2    /* Got at least 1 word character */
12251         && (*p == first_char
12252             || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
12253     {
12254         return TRUE;
12255     }
12256
12257     p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
12258
12259     return (p
12260             && p - RExC_parse > 2 /* [:] evaluates to colon;
12261                                       [::] is a bad posix class. */
12262             && first_char == *(p - 1));
12263 }
12264
12265 STATIC regnode *
12266 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *flagp, U32 depth,
12267                    char * const oregcomp_parse)
12268 {
12269     /* Handle the (?[...]) construct to do set operations */
12270
12271     U8 curchar;
12272     UV start, end;      /* End points of code point ranges */
12273     SV* result_string;
12274     char *save_end, *save_parse;
12275     SV* final;
12276     STRLEN len;
12277     regnode* node;
12278     AV* stack;
12279     const bool save_fold = FOLD;
12280
12281     GET_RE_DEBUG_FLAGS_DECL;
12282
12283     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
12284
12285     if (LOC) {
12286         vFAIL("(?[...]) not valid in locale");
12287     }
12288     RExC_uni_semantics = 1;
12289
12290     /* This will return only an ANYOF regnode, or (unlikely) something smaller
12291      * (such as EXACT).  Thus we can skip most everything if just sizing.  We
12292      * call regclass to handle '[]' so as to not have to reinvent its parsing
12293      * rules here (throwing away the size it computes each time).  And, we exit
12294      * upon an unescaped ']' that isn't one ending a regclass.  To do both
12295      * these things, we need to realize that something preceded by a backslash
12296      * is escaped, so we have to keep track of backslashes */
12297     if (SIZE_ONLY) {
12298         UV depth = 0; /* how many nested (?[...]) constructs */
12299
12300         Perl_ck_warner_d(aTHX_
12301             packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
12302             "The regex_sets feature is experimental" REPORT_LOCATION,
12303                 UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
12304                 UTF8fARG(UTF, RExC_end - RExC_start - (RExC_parse - RExC_precomp), RExC_precomp + (RExC_parse - RExC_precomp)));
12305
12306         while (RExC_parse < RExC_end) {
12307             SV* current = NULL;
12308             RExC_parse = regpatws(pRExC_state, RExC_parse,
12309                                 TRUE); /* means recognize comments */
12310             switch (*RExC_parse) {
12311                 case '?':
12312                     if (RExC_parse[1] == '[') depth++, RExC_parse++;
12313                     /* FALL THROUGH */
12314                 default:
12315                     break;
12316                 case '\\':
12317                     /* Skip the next byte (which could cause us to end up in
12318                      * the middle of a UTF-8 character, but since none of those
12319                      * are confusable with anything we currently handle in this
12320                      * switch (invariants all), it's safe.  We'll just hit the
12321                      * default: case next time and keep on incrementing until
12322                      * we find one of the invariants we do handle. */
12323                     RExC_parse++;
12324                     break;
12325                 case '[':
12326                 {
12327                     /* If this looks like it is a [:posix:] class, leave the
12328                      * parse pointer at the '[' to fool regclass() into
12329                      * thinking it is part of a '[[:posix:]]'.  That function
12330                      * will use strict checking to force a syntax error if it
12331                      * doesn't work out to a legitimate class */
12332                     bool is_posix_class
12333                                     = could_it_be_a_POSIX_class(pRExC_state);
12334                     if (! is_posix_class) {
12335                         RExC_parse++;
12336                     }
12337
12338                     /* regclass() can only return RESTART_UTF8 if multi-char
12339                        folds are allowed.  */
12340                     if (!regclass(pRExC_state, flagp,depth+1,
12341                                   is_posix_class, /* parse the whole char
12342                                                      class only if not a
12343                                                      posix class */
12344                                   FALSE, /* don't allow multi-char folds */
12345                                   TRUE, /* silence non-portable warnings. */
12346                                   &current))
12347                         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12348                               (UV) *flagp);
12349
12350                     /* function call leaves parse pointing to the ']', except
12351                      * if we faked it */
12352                     if (is_posix_class) {
12353                         RExC_parse--;
12354                     }
12355
12356                     SvREFCNT_dec(current);   /* In case it returned something */
12357                     break;
12358                 }
12359
12360                 case ']':
12361                     if (depth--) break;
12362                     RExC_parse++;
12363                     if (RExC_parse < RExC_end
12364                         && *RExC_parse == ')')
12365                     {
12366                         node = reganode(pRExC_state, ANYOF, 0);
12367                         RExC_size += ANYOF_SKIP;
12368                         nextchar(pRExC_state);
12369                         Set_Node_Length(node,
12370                                 RExC_parse - oregcomp_parse + 1); /* MJD */
12371                         return node;
12372                     }
12373                     goto no_close;
12374             }
12375             RExC_parse++;
12376         }
12377
12378         no_close:
12379         FAIL("Syntax error in (?[...])");
12380     }
12381
12382     /* Pass 2 only after this.  Everything in this construct is a
12383      * metacharacter.  Operands begin with either a '\' (for an escape
12384      * sequence), or a '[' for a bracketed character class.  Any other
12385      * character should be an operator, or parenthesis for grouping.  Both
12386      * types of operands are handled by calling regclass() to parse them.  It
12387      * is called with a parameter to indicate to return the computed inversion
12388      * list.  The parsing here is implemented via a stack.  Each entry on the
12389      * stack is a single character representing one of the operators, or the
12390      * '('; or else a pointer to an operand inversion list. */
12391
12392 #define IS_OPERAND(a)  (! SvIOK(a))
12393
12394     /* The stack starts empty.  It is a syntax error if the first thing parsed
12395      * is a binary operator; everything else is pushed on the stack.  When an
12396      * operand is parsed, the top of the stack is examined.  If it is a binary
12397      * operator, the item before it should be an operand, and both are replaced
12398      * by the result of doing that operation on the new operand and the one on
12399      * the stack.   Thus a sequence of binary operands is reduced to a single
12400      * one before the next one is parsed.
12401      *
12402      * A unary operator may immediately follow a binary in the input, for
12403      * example
12404      *      [a] + ! [b]
12405      * When an operand is parsed and the top of the stack is a unary operator,
12406      * the operation is performed, and then the stack is rechecked to see if
12407      * this new operand is part of a binary operation; if so, it is handled as
12408      * above.
12409      *
12410      * A '(' is simply pushed on the stack; it is valid only if the stack is
12411      * empty, or the top element of the stack is an operator or another '('
12412      * (for which the parenthesized expression will become an operand).  By the
12413      * time the corresponding ')' is parsed everything in between should have
12414      * been parsed and evaluated to a single operand (or else is a syntax
12415      * error), and is handled as a regular operand */
12416
12417     sv_2mortal((SV *)(stack = newAV()));
12418
12419     while (RExC_parse < RExC_end) {
12420         I32 top_index = av_tindex(stack);
12421         SV** top_ptr;
12422         SV* current = NULL;
12423
12424         /* Skip white space */
12425         RExC_parse = regpatws(pRExC_state, RExC_parse,
12426                                 TRUE); /* means recognize comments */
12427         if (RExC_parse >= RExC_end) {
12428             Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
12429         }
12430         if ((curchar = UCHARAT(RExC_parse)) == ']') {
12431             break;
12432         }
12433
12434         switch (curchar) {
12435
12436             case '?':
12437                 if (av_tindex(stack) >= 0   /* This makes sure that we can
12438                                                safely subtract 1 from
12439                                                RExC_parse in the next clause.
12440                                                If we have something on the
12441                                                stack, we have parsed something
12442                                              */
12443                     && UCHARAT(RExC_parse - 1) == '('
12444                     && RExC_parse < RExC_end)
12445                 {
12446                     /* If is a '(?', could be an embedded '(?flags:(?[...])'.
12447                      * This happens when we have some thing like
12448                      *
12449                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
12450                      *   ...
12451                      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
12452                      *
12453                      * Here we would be handling the interpolated
12454                      * '$thai_or_lao'.  We handle this by a recursive call to
12455                      * ourselves which returns the inversion list the
12456                      * interpolated expression evaluates to.  We use the flags
12457                      * from the interpolated pattern. */
12458                     U32 save_flags = RExC_flags;
12459                     const char * const save_parse = ++RExC_parse;
12460
12461                     parse_lparen_question_flags(pRExC_state);
12462
12463                     if (RExC_parse == save_parse  /* Makes sure there was at
12464                                                      least one flag (or this
12465                                                      embedding wasn't compiled)
12466                                                    */
12467                         || RExC_parse >= RExC_end - 4
12468                         || UCHARAT(RExC_parse) != ':'
12469                         || UCHARAT(++RExC_parse) != '('
12470                         || UCHARAT(++RExC_parse) != '?'
12471                         || UCHARAT(++RExC_parse) != '[')
12472                     {
12473
12474                         /* In combination with the above, this moves the
12475                          * pointer to the point just after the first erroneous
12476                          * character (or if there are no flags, to where they
12477                          * should have been) */
12478                         if (RExC_parse >= RExC_end - 4) {
12479                             RExC_parse = RExC_end;
12480                         }
12481                         else if (RExC_parse != save_parse) {
12482                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12483                         }
12484                         vFAIL("Expecting '(?flags:(?[...'");
12485                     }
12486                     RExC_parse++;
12487                     (void) handle_regex_sets(pRExC_state, &current, flagp,
12488                                                     depth+1, oregcomp_parse);
12489
12490                     /* Here, 'current' contains the embedded expression's
12491                      * inversion list, and RExC_parse points to the trailing
12492                      * ']'; the next character should be the ')' which will be
12493                      * paired with the '(' that has been put on the stack, so
12494                      * the whole embedded expression reduces to '(operand)' */
12495                     RExC_parse++;
12496
12497                     RExC_flags = save_flags;
12498                     goto handle_operand;
12499                 }
12500                 /* FALL THROUGH */
12501
12502             default:
12503                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12504                 vFAIL("Unexpected character");
12505
12506             case '\\':
12507                 /* regclass() can only return RESTART_UTF8 if multi-char
12508                    folds are allowed.  */
12509                 if (!regclass(pRExC_state, flagp,depth+1,
12510                               TRUE, /* means parse just the next thing */
12511                               FALSE, /* don't allow multi-char folds */
12512                               FALSE, /* don't silence non-portable warnings.  */
12513                               &current))
12514                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12515                           (UV) *flagp);
12516                 /* regclass() will return with parsing just the \ sequence,
12517                  * leaving the parse pointer at the next thing to parse */
12518                 RExC_parse--;
12519                 goto handle_operand;
12520
12521             case '[':   /* Is a bracketed character class */
12522             {
12523                 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
12524
12525                 if (! is_posix_class) {
12526                     RExC_parse++;
12527                 }
12528
12529                 /* regclass() can only return RESTART_UTF8 if multi-char
12530                    folds are allowed.  */
12531                 if(!regclass(pRExC_state, flagp,depth+1,
12532                              is_posix_class, /* parse the whole char class
12533                                                 only if not a posix class */
12534                              FALSE, /* don't allow multi-char folds */
12535                              FALSE, /* don't silence non-portable warnings.  */
12536                              &current))
12537                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12538                           (UV) *flagp);
12539                 /* function call leaves parse pointing to the ']', except if we
12540                  * faked it */
12541                 if (is_posix_class) {
12542                     RExC_parse--;
12543                 }
12544
12545                 goto handle_operand;
12546             }
12547
12548             case '&':
12549             case '|':
12550             case '+':
12551             case '-':
12552             case '^':
12553                 if (top_index < 0
12554                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
12555                     || ! IS_OPERAND(*top_ptr))
12556                 {
12557                     RExC_parse++;
12558                     vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
12559                 }
12560                 av_push(stack, newSVuv(curchar));
12561                 break;
12562
12563             case '!':
12564                 av_push(stack, newSVuv(curchar));
12565                 break;
12566
12567             case '(':
12568                 if (top_index >= 0) {
12569                     top_ptr = av_fetch(stack, top_index, FALSE);
12570                     assert(top_ptr);
12571                     if (IS_OPERAND(*top_ptr)) {
12572                         RExC_parse++;
12573                         vFAIL("Unexpected '(' with no preceding operator");
12574                     }
12575                 }
12576                 av_push(stack, newSVuv(curchar));
12577                 break;
12578
12579             case ')':
12580             {
12581                 SV* lparen;
12582                 if (top_index < 1
12583                     || ! (current = av_pop(stack))
12584                     || ! IS_OPERAND(current)
12585                     || ! (lparen = av_pop(stack))
12586                     || IS_OPERAND(lparen)
12587                     || SvUV(lparen) != '(')
12588                 {
12589                     SvREFCNT_dec(current);
12590                     RExC_parse++;
12591                     vFAIL("Unexpected ')'");
12592                 }
12593                 top_index -= 2;
12594                 SvREFCNT_dec_NN(lparen);
12595
12596                 /* FALL THROUGH */
12597             }
12598
12599               handle_operand:
12600
12601                 /* Here, we have an operand to process, in 'current' */
12602
12603                 if (top_index < 0) {    /* Just push if stack is empty */
12604                     av_push(stack, current);
12605                 }
12606                 else {
12607                     SV* top = av_pop(stack);
12608                     SV *prev = NULL;
12609                     char current_operator;
12610
12611                     if (IS_OPERAND(top)) {
12612                         SvREFCNT_dec_NN(top);
12613                         SvREFCNT_dec_NN(current);
12614                         vFAIL("Operand with no preceding operator");
12615                     }
12616                     current_operator = (char) SvUV(top);
12617                     switch (current_operator) {
12618                         case '(':   /* Push the '(' back on followed by the new
12619                                        operand */
12620                             av_push(stack, top);
12621                             av_push(stack, current);
12622                             SvREFCNT_inc(top);  /* Counters the '_dec' done
12623                                                    just after the 'break', so
12624                                                    it doesn't get wrongly freed
12625                                                  */
12626                             break;
12627
12628                         case '!':
12629                             _invlist_invert(current);
12630
12631                             /* Unlike binary operators, the top of the stack,
12632                              * now that this unary one has been popped off, may
12633                              * legally be an operator, and we now have operand
12634                              * for it. */
12635                             top_index--;
12636                             SvREFCNT_dec_NN(top);
12637                             goto handle_operand;
12638
12639                         case '&':
12640                             prev = av_pop(stack);
12641                             _invlist_intersection(prev,
12642                                                    current,
12643                                                    &current);
12644                             av_push(stack, current);
12645                             break;
12646
12647                         case '|':
12648                         case '+':
12649                             prev = av_pop(stack);
12650                             _invlist_union(prev, current, &current);
12651                             av_push(stack, current);
12652                             break;
12653
12654                         case '-':
12655                             prev = av_pop(stack);;
12656                             _invlist_subtract(prev, current, &current);
12657                             av_push(stack, current);
12658                             break;
12659
12660                         case '^':   /* The union minus the intersection */
12661                         {
12662                             SV* i = NULL;
12663                             SV* u = NULL;
12664                             SV* element;
12665
12666                             prev = av_pop(stack);
12667                             _invlist_union(prev, current, &u);
12668                             _invlist_intersection(prev, current, &i);
12669                             /* _invlist_subtract will overwrite current
12670                                 without freeing what it already contains */
12671                             element = current;
12672                             _invlist_subtract(u, i, &current);
12673                             av_push(stack, current);
12674                             SvREFCNT_dec_NN(i);
12675                             SvREFCNT_dec_NN(u);
12676                             SvREFCNT_dec_NN(element);
12677                             break;
12678                         }
12679
12680                         default:
12681                             Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
12682                 }
12683                 SvREFCNT_dec_NN(top);
12684                 SvREFCNT_dec(prev);
12685             }
12686         }
12687
12688         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12689     }
12690
12691     if (av_tindex(stack) < 0   /* Was empty */
12692         || ((final = av_pop(stack)) == NULL)
12693         || ! IS_OPERAND(final)
12694         || av_tindex(stack) >= 0)  /* More left on stack */
12695     {
12696         vFAIL("Incomplete expression within '(?[ ])'");
12697     }
12698
12699     /* Here, 'final' is the resultant inversion list from evaluating the
12700      * expression.  Return it if so requested */
12701     if (return_invlist) {
12702         *return_invlist = final;
12703         return END;
12704     }
12705
12706     /* Otherwise generate a resultant node, based on 'final'.  regclass() is
12707      * expecting a string of ranges and individual code points */
12708     invlist_iterinit(final);
12709     result_string = newSVpvs("");
12710     while (invlist_iternext(final, &start, &end)) {
12711         if (start == end) {
12712             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
12713         }
12714         else {
12715             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
12716                                                      start,          end);
12717         }
12718     }
12719
12720     save_parse = RExC_parse;
12721     RExC_parse = SvPV(result_string, len);
12722     save_end = RExC_end;
12723     RExC_end = RExC_parse + len;
12724
12725     /* We turn off folding around the call, as the class we have constructed
12726      * already has all folding taken into consideration, and we don't want
12727      * regclass() to add to that */
12728     RExC_flags &= ~RXf_PMf_FOLD;
12729     /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
12730      */
12731     node = regclass(pRExC_state, flagp,depth+1,
12732                     FALSE, /* means parse the whole char class */
12733                     FALSE, /* don't allow multi-char folds */
12734                     TRUE, /* silence non-portable warnings.  The above may very
12735                              well have generated non-portable code points, but
12736                              they're valid on this machine */
12737                     NULL);
12738     if (!node)
12739         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
12740                     PTR2UV(flagp));
12741     if (save_fold) {
12742         RExC_flags |= RXf_PMf_FOLD;
12743     }
12744     RExC_parse = save_parse + 1;
12745     RExC_end = save_end;
12746     SvREFCNT_dec_NN(final);
12747     SvREFCNT_dec_NN(result_string);
12748
12749     nextchar(pRExC_state);
12750     Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
12751     return node;
12752 }
12753 #undef IS_OPERAND
12754
12755 /* The names of properties whose definitions are not known at compile time are
12756  * stored in this SV, after a constant heading.  So if the length has been
12757  * changed since initialization, then there is a run-time definition. */
12758 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
12759
12760 STATIC regnode *
12761 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
12762                  const bool stop_at_1,  /* Just parse the next thing, don't
12763                                            look for a full character class */
12764                  bool allow_multi_folds,
12765                  const bool silence_non_portable,   /* Don't output warnings
12766                                                        about too large
12767                                                        characters */
12768                  SV** ret_invlist)  /* Return an inversion list, not a node */
12769 {
12770     /* parse a bracketed class specification.  Most of these will produce an
12771      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
12772      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
12773      * under /i with multi-character folds: it will be rewritten following the
12774      * paradigm of this example, where the <multi-fold>s are characters which
12775      * fold to multiple character sequences:
12776      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
12777      * gets effectively rewritten as:
12778      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
12779      * reg() gets called (recursively) on the rewritten version, and this
12780      * function will return what it constructs.  (Actually the <multi-fold>s
12781      * aren't physically removed from the [abcdefghi], it's just that they are
12782      * ignored in the recursion by means of a flag:
12783      * <RExC_in_multi_char_class>.)
12784      *
12785      * ANYOF nodes contain a bit map for the first 256 characters, with the
12786      * corresponding bit set if that character is in the list.  For characters
12787      * above 255, a range list or swash is used.  There are extra bits for \w,
12788      * etc. in locale ANYOFs, as what these match is not determinable at
12789      * compile time
12790      *
12791      * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
12792      * to be restarted.  This can only happen if ret_invlist is non-NULL.
12793      */
12794
12795     dVAR;
12796     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
12797     IV range = 0;
12798     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
12799     regnode *ret;
12800     STRLEN numlen;
12801     IV namedclass = OOB_NAMEDCLASS;
12802     char *rangebegin = NULL;
12803     bool need_class = 0;
12804     SV *listsv = NULL;
12805     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
12806                                       than just initialized.  */
12807     SV* properties = NULL;    /* Code points that match \p{} \P{} */
12808     SV* posixes = NULL;     /* Code points that match classes like, [:word:],
12809                                extended beyond the Latin1 range */
12810     UV element_count = 0;   /* Number of distinct elements in the class.
12811                                Optimizations may be possible if this is tiny */
12812     AV * multi_char_matches = NULL; /* Code points that fold to more than one
12813                                        character; used under /i */
12814     UV n;
12815     char * stop_ptr = RExC_end;    /* where to stop parsing */
12816     const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
12817                                                    space? */
12818     const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
12819
12820     /* Unicode properties are stored in a swash; this holds the current one
12821      * being parsed.  If this swash is the only above-latin1 component of the
12822      * character class, an optimization is to pass it directly on to the
12823      * execution engine.  Otherwise, it is set to NULL to indicate that there
12824      * are other things in the class that have to be dealt with at execution
12825      * time */
12826     SV* swash = NULL;           /* Code points that match \p{} \P{} */
12827
12828     /* Set if a component of this character class is user-defined; just passed
12829      * on to the engine */
12830     bool has_user_defined_property = FALSE;
12831
12832     /* inversion list of code points this node matches only when the target
12833      * string is in UTF-8.  (Because is under /d) */
12834     SV* depends_list = NULL;
12835
12836     /* inversion list of code points this node matches.  For much of the
12837      * function, it includes only those that match regardless of the utf8ness
12838      * of the target string */
12839     SV* cp_list = NULL;
12840
12841 #ifdef EBCDIC
12842     /* In a range, counts how many 0-2 of the ends of it came from literals,
12843      * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
12844     UV literal_endpoint = 0;
12845 #endif
12846     bool invert = FALSE;    /* Is this class to be complemented */
12847
12848     /* Is there any thing like \W or [:^digit:] that matches above the legal
12849      * Unicode range? */
12850     bool runtime_posix_matches_above_Unicode = FALSE;
12851
12852     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
12853         case we need to change the emitted regop to an EXACT. */
12854     const char * orig_parse = RExC_parse;
12855     const SSize_t orig_size = RExC_size;
12856     bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
12857     GET_RE_DEBUG_FLAGS_DECL;
12858
12859     PERL_ARGS_ASSERT_REGCLASS;
12860 #ifndef DEBUGGING
12861     PERL_UNUSED_ARG(depth);
12862 #endif
12863
12864     DEBUG_PARSE("clas");
12865
12866     /* Assume we are going to generate an ANYOF node. */
12867     ret = reganode(pRExC_state, ANYOF, 0);
12868
12869     if (SIZE_ONLY) {
12870         RExC_size += ANYOF_SKIP;
12871         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
12872     }
12873     else {
12874         ANYOF_FLAGS(ret) = 0;
12875
12876         RExC_emit += ANYOF_SKIP;
12877         if (LOC) {
12878             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
12879         }
12880         listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
12881         initial_listsv_len = SvCUR(listsv);
12882         SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
12883     }
12884
12885     if (skip_white) {
12886         RExC_parse = regpatws(pRExC_state, RExC_parse,
12887                               FALSE /* means don't recognize comments */);
12888     }
12889
12890     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
12891         RExC_parse++;
12892         invert = TRUE;
12893         allow_multi_folds = FALSE;
12894         RExC_naughty++;
12895         if (skip_white) {
12896             RExC_parse = regpatws(pRExC_state, RExC_parse,
12897                                   FALSE /* means don't recognize comments */);
12898         }
12899     }
12900
12901     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
12902     if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
12903         const char *s = RExC_parse;
12904         const char  c = *s++;
12905
12906         while (isWORDCHAR(*s))
12907             s++;
12908         if (*s && c == *s && s[1] == ']') {
12909             SAVEFREESV(RExC_rx_sv);
12910             ckWARN3reg(s+2,
12911                        "POSIX syntax [%c %c] belongs inside character classes",
12912                        c, c);
12913             (void)ReREFCNT_inc(RExC_rx_sv);
12914         }
12915     }
12916
12917     /* If the caller wants us to just parse a single element, accomplish this
12918      * by faking the loop ending condition */
12919     if (stop_at_1 && RExC_end > RExC_parse) {
12920         stop_ptr = RExC_parse + 1;
12921     }
12922
12923     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
12924     if (UCHARAT(RExC_parse) == ']')
12925         goto charclassloop;
12926
12927 parseit:
12928     while (1) {
12929         if  (RExC_parse >= stop_ptr) {
12930             break;
12931         }
12932
12933         if (skip_white) {
12934             RExC_parse = regpatws(pRExC_state, RExC_parse,
12935                                   FALSE /* means don't recognize comments */);
12936         }
12937
12938         if  (UCHARAT(RExC_parse) == ']') {
12939             break;
12940         }
12941
12942     charclassloop:
12943
12944         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
12945         save_value = value;
12946         save_prevvalue = prevvalue;
12947
12948         if (!range) {
12949             rangebegin = RExC_parse;
12950             element_count++;
12951         }
12952         if (UTF) {
12953             value = utf8n_to_uvchr((U8*)RExC_parse,
12954                                    RExC_end - RExC_parse,
12955                                    &numlen, UTF8_ALLOW_DEFAULT);
12956             RExC_parse += numlen;
12957         }
12958         else
12959             value = UCHARAT(RExC_parse++);
12960
12961         if (value == '['
12962             && RExC_parse < RExC_end
12963             && POSIXCC(UCHARAT(RExC_parse)))
12964         {
12965             namedclass = regpposixcc(pRExC_state, value, strict);
12966         }
12967         else if (value == '\\') {
12968             if (UTF) {
12969                 value = utf8n_to_uvchr((U8*)RExC_parse,
12970                                    RExC_end - RExC_parse,
12971                                    &numlen, UTF8_ALLOW_DEFAULT);
12972                 RExC_parse += numlen;
12973             }
12974             else
12975                 value = UCHARAT(RExC_parse++);
12976
12977             /* Some compilers cannot handle switching on 64-bit integer
12978              * values, therefore value cannot be an UV.  Yes, this will
12979              * be a problem later if we want switch on Unicode.
12980              * A similar issue a little bit later when switching on
12981              * namedclass. --jhi */
12982
12983             /* If the \ is escaping white space when white space is being
12984              * skipped, it means that that white space is wanted literally, and
12985              * is already in 'value'.  Otherwise, need to translate the escape
12986              * into what it signifies. */
12987             if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
12988
12989             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
12990             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
12991             case 's':   namedclass = ANYOF_SPACE;       break;
12992             case 'S':   namedclass = ANYOF_NSPACE;      break;
12993             case 'd':   namedclass = ANYOF_DIGIT;       break;
12994             case 'D':   namedclass = ANYOF_NDIGIT;      break;
12995             case 'v':   namedclass = ANYOF_VERTWS;      break;
12996             case 'V':   namedclass = ANYOF_NVERTWS;     break;
12997             case 'h':   namedclass = ANYOF_HORIZWS;     break;
12998             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
12999             case 'N':  /* Handle \N{NAME} in class */
13000                 {
13001                     /* We only pay attention to the first char of
13002                     multichar strings being returned. I kinda wonder
13003                     if this makes sense as it does change the behaviour
13004                     from earlier versions, OTOH that behaviour was broken
13005                     as well. */
13006                     if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
13007                                       TRUE, /* => charclass */
13008                                       strict))
13009                     {
13010                         if (*flagp & RESTART_UTF8)
13011                             FAIL("panic: grok_bslash_N set RESTART_UTF8");
13012                         goto parseit;
13013                     }
13014                 }
13015                 break;
13016             case 'p':
13017             case 'P':
13018                 {
13019                 char *e;
13020
13021                 /* We will handle any undefined properties ourselves */
13022                 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
13023                                        /* And we actually would prefer to get
13024                                         * the straight inversion list of the
13025                                         * swash, since we will be accessing it
13026                                         * anyway, to save a little time */
13027                                       |_CORE_SWASH_INIT_ACCEPT_INVLIST;
13028
13029                 if (RExC_parse >= RExC_end)
13030                     vFAIL2("Empty \\%c{}", (U8)value);
13031                 if (*RExC_parse == '{') {
13032                     const U8 c = (U8)value;
13033                     e = strchr(RExC_parse++, '}');
13034                     if (!e)
13035                         vFAIL2("Missing right brace on \\%c{}", c);
13036                     while (isSPACE(UCHARAT(RExC_parse)))
13037                         RExC_parse++;
13038                     if (e == RExC_parse)
13039                         vFAIL2("Empty \\%c{}", c);
13040                     n = e - RExC_parse;
13041                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
13042                         n--;
13043                 }
13044                 else {
13045                     e = RExC_parse;
13046                     n = 1;
13047                 }
13048                 if (!SIZE_ONLY) {
13049                     SV* invlist;
13050                     char* formatted;
13051                     char* name;
13052
13053                     if (UCHARAT(RExC_parse) == '^') {
13054                          RExC_parse++;
13055                          n--;
13056                          /* toggle.  (The rhs xor gets the single bit that
13057                           * differs between P and p; the other xor inverts just
13058                           * that bit) */
13059                          value ^= 'P' ^ 'p';
13060
13061                          while (isSPACE(UCHARAT(RExC_parse))) {
13062                               RExC_parse++;
13063                               n--;
13064                          }
13065                     }
13066                     /* Try to get the definition of the property into
13067                      * <invlist>.  If /i is in effect, the effective property
13068                      * will have its name be <__NAME_i>.  The design is
13069                      * discussed in commit
13070                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
13071                     formatted = Perl_form(aTHX_
13072                                           "%s%.*s%s\n",
13073                                           (FOLD) ? "__" : "",
13074                                           (int)n,
13075                                           RExC_parse,
13076                                           (FOLD) ? "_i" : ""
13077                                 );
13078                     name = savepvn(formatted, strlen(formatted));
13079
13080                     /* Look up the property name, and get its swash and
13081                      * inversion list, if the property is found  */
13082                     if (swash) {
13083                         SvREFCNT_dec_NN(swash);
13084                     }
13085                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
13086                                              1, /* binary */
13087                                              0, /* not tr/// */
13088                                              NULL, /* No inversion list */
13089                                              &swash_init_flags
13090                                             );
13091                     if (! swash || ! (invlist = _get_swash_invlist(swash))) {
13092                         if (swash) {
13093                             SvREFCNT_dec_NN(swash);
13094                             swash = NULL;
13095                         }
13096
13097                         /* Here didn't find it.  It could be a user-defined
13098                          * property that will be available at run-time.  If we
13099                          * accept only compile-time properties, is an error;
13100                          * otherwise add it to the list for run-time look up */
13101                         if (ret_invlist) {
13102                             RExC_parse = e + 1;
13103                             vFAIL2utf8f(
13104                                 "Property '%"UTF8f"' is unknown",
13105                                 UTF8fARG(UTF, n, name));
13106                         }
13107                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
13108                                         (value == 'p' ? '+' : '!'),
13109                                         UTF8fARG(UTF, n, name));
13110                         has_user_defined_property = TRUE;
13111
13112                         /* We don't know yet, so have to assume that the
13113                          * property could match something in the Latin1 range,
13114                          * hence something that isn't utf8.  Note that this
13115                          * would cause things in <depends_list> to match
13116                          * inappropriately, except that any \p{}, including
13117                          * this one forces Unicode semantics, which means there
13118                          * is <no depends_list> */
13119                         ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
13120                     }
13121                     else {
13122
13123                         /* Here, did get the swash and its inversion list.  If
13124                          * the swash is from a user-defined property, then this
13125                          * whole character class should be regarded as such */
13126                         has_user_defined_property =
13127                                     (swash_init_flags
13128                                      & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
13129
13130                         /* Invert if asking for the complement */
13131                         if (value == 'P') {
13132                             _invlist_union_complement_2nd(properties,
13133                                                           invlist,
13134                                                           &properties);
13135
13136                             /* The swash can't be used as-is, because we've
13137                              * inverted things; delay removing it to here after
13138                              * have copied its invlist above */
13139                             SvREFCNT_dec_NN(swash);
13140                             swash = NULL;
13141                         }
13142                         else {
13143                             _invlist_union(properties, invlist, &properties);
13144                         }
13145                     }
13146                     Safefree(name);
13147                 }
13148                 RExC_parse = e + 1;
13149                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
13150                                                 named */
13151
13152                 /* \p means they want Unicode semantics */
13153                 RExC_uni_semantics = 1;
13154                 }
13155                 break;
13156             case 'n':   value = '\n';                   break;
13157             case 'r':   value = '\r';                   break;
13158             case 't':   value = '\t';                   break;
13159             case 'f':   value = '\f';                   break;
13160             case 'b':   value = '\b';                   break;
13161             case 'e':   value = ASCII_TO_NATIVE('\033');break;
13162             case 'a':   value = '\a';                   break;
13163             case 'o':
13164                 RExC_parse--;   /* function expects to be pointed at the 'o' */
13165                 {
13166                     const char* error_msg;
13167                     bool valid = grok_bslash_o(&RExC_parse,
13168                                                &value,
13169                                                &error_msg,
13170                                                SIZE_ONLY,   /* warnings in pass
13171                                                                1 only */
13172                                                strict,
13173                                                silence_non_portable,
13174                                                UTF);
13175                     if (! valid) {
13176                         vFAIL(error_msg);
13177                     }
13178                 }
13179                 if (PL_encoding && value < 0x100) {
13180                     goto recode_encoding;
13181                 }
13182                 break;
13183             case 'x':
13184                 RExC_parse--;   /* function expects to be pointed at the 'x' */
13185                 {
13186                     const char* error_msg;
13187                     bool valid = grok_bslash_x(&RExC_parse,
13188                                                &value,
13189                                                &error_msg,
13190                                                TRUE, /* Output warnings */
13191                                                strict,
13192                                                silence_non_portable,
13193                                                UTF);
13194                     if (! valid) {
13195                         vFAIL(error_msg);
13196                     }
13197                 }
13198                 if (PL_encoding && value < 0x100)
13199                     goto recode_encoding;
13200                 break;
13201             case 'c':
13202                 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
13203                 break;
13204             case '0': case '1': case '2': case '3': case '4':
13205             case '5': case '6': case '7':
13206                 {
13207                     /* Take 1-3 octal digits */
13208                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
13209                     numlen = (strict) ? 4 : 3;
13210                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
13211                     RExC_parse += numlen;
13212                     if (numlen != 3) {
13213                         if (strict) {
13214                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13215                             vFAIL("Need exactly 3 octal digits");
13216                         }
13217                         else if (! SIZE_ONLY /* like \08, \178 */
13218                                  && numlen < 3
13219                                  && RExC_parse < RExC_end
13220                                  && isDIGIT(*RExC_parse)
13221                                  && ckWARN(WARN_REGEXP))
13222                         {
13223                             SAVEFREESV(RExC_rx_sv);
13224                             reg_warn_non_literal_string(
13225                                  RExC_parse + 1,
13226                                  form_short_octal_warning(RExC_parse, numlen));
13227                             (void)ReREFCNT_inc(RExC_rx_sv);
13228                         }
13229                     }
13230                     if (PL_encoding && value < 0x100)
13231                         goto recode_encoding;
13232                     break;
13233                 }
13234             recode_encoding:
13235                 if (! RExC_override_recoding) {
13236                     SV* enc = PL_encoding;
13237                     value = reg_recode((const char)(U8)value, &enc);
13238                     if (!enc) {
13239                         if (strict) {
13240                             vFAIL("Invalid escape in the specified encoding");
13241                         }
13242                         else if (SIZE_ONLY) {
13243                             ckWARNreg(RExC_parse,
13244                                   "Invalid escape in the specified encoding");
13245                         }
13246                     }
13247                     break;
13248                 }
13249             default:
13250                 /* Allow \_ to not give an error */
13251                 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
13252                     if (strict) {
13253                         vFAIL2("Unrecognized escape \\%c in character class",
13254                                (int)value);
13255                     }
13256                     else {
13257                         SAVEFREESV(RExC_rx_sv);
13258                         ckWARN2reg(RExC_parse,
13259                             "Unrecognized escape \\%c in character class passed through",
13260                             (int)value);
13261                         (void)ReREFCNT_inc(RExC_rx_sv);
13262                     }
13263                 }
13264                 break;
13265             }   /* End of switch on char following backslash */
13266         } /* end of handling backslash escape sequences */
13267 #ifdef EBCDIC
13268         else
13269             literal_endpoint++;
13270 #endif
13271
13272         /* Here, we have the current token in 'value' */
13273
13274         /* What matches in a locale is not known until runtime.  This includes
13275          * what the Posix classes (like \w, [:space:]) match.  Room must be
13276          * reserved (one time per outer bracketed class) to store such classes,
13277          * either if Perl is compiled so that locale nodes always should have
13278          * this space, or if there is such posix class info to be stored.  The
13279          * space will contain a bit for each named class that is to be matched
13280          * against.  This isn't needed for \p{} and pseudo-classes, as they are
13281          * not affected by locale, and hence are dealt with separately */
13282         if (LOC
13283             && ! need_class
13284             && (ANYOF_LOCALE == ANYOF_POSIXL
13285                 || (namedclass > OOB_NAMEDCLASS
13286                     && namedclass < ANYOF_POSIXL_MAX)))
13287         {
13288             need_class = 1;
13289             if (SIZE_ONLY) {
13290                 RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13291             }
13292             else {
13293                 RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13294             }
13295             ANYOF_POSIXL_ZERO(ret);
13296             ANYOF_FLAGS(ret) |= ANYOF_POSIXL;
13297         }
13298
13299         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
13300             U8 classnum;
13301
13302             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
13303              * literal, as is the character that began the false range, i.e.
13304              * the 'a' in the examples */
13305             if (range) {
13306                 if (!SIZE_ONLY) {
13307                     const int w = (RExC_parse >= rangebegin)
13308                                   ? RExC_parse - rangebegin
13309                                   : 0;
13310                     if (strict) {
13311                         vFAIL2utf8f(
13312                             "False [] range \"%"UTF8f"\"",
13313                             UTF8fARG(UTF, w, rangebegin));
13314                     }
13315                     else {
13316                         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
13317                         ckWARN2reg(RExC_parse,
13318                             "False [] range \"%"UTF8f"\"",
13319                             UTF8fARG(UTF, w, rangebegin));
13320                         (void)ReREFCNT_inc(RExC_rx_sv);
13321                         cp_list = add_cp_to_invlist(cp_list, '-');
13322                         cp_list = add_cp_to_invlist(cp_list, prevvalue);
13323                     }
13324                 }
13325
13326                 range = 0; /* this was not a true range */
13327                 element_count += 2; /* So counts for three values */
13328             }
13329
13330             classnum = namedclass_to_classnum(namedclass);
13331
13332             if (LOC && namedclass < ANYOF_POSIXL_MAX
13333 #ifndef HAS_ISASCII
13334                 && classnum != _CC_ASCII
13335 #endif
13336 #ifndef HAS_ISBLANK
13337                 && classnum != _CC_BLANK
13338 #endif
13339             ) {
13340                 if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL)
13341                     && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
13342                                                             ? -1
13343                                                             : 1)))
13344                 {
13345                     posixl_matches_all = TRUE;
13346                     break;
13347                 }
13348                 ANYOF_POSIXL_SET(ret, namedclass);
13349             }
13350             /* XXX After have made all the posix classes known at compile time
13351              * we can move the LOC handling below to above */
13352
13353             if (! SIZE_ONLY) {
13354                 if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
13355                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
13356
13357                         /* Here, should be \h, \H, \v, or \V.  Neither /d nor
13358                          * /l make a difference in what these match.  There
13359                          * would be problems if these characters had folds
13360                          * other than themselves, as cp_list is subject to
13361                          * folding. */
13362                         if (classnum != _CC_VERTSPACE) {
13363                             assert(   namedclass == ANYOF_HORIZWS
13364                                    || namedclass == ANYOF_NHORIZWS);
13365
13366                             /* It turns out that \h is just a synonym for
13367                              * XPosixBlank */
13368                             classnum = _CC_BLANK;
13369                         }
13370
13371                         _invlist_union_maybe_complement_2nd(
13372                                 cp_list,
13373                                 PL_XPosix_ptrs[classnum],
13374                                 cBOOL(namedclass % 2), /* Complement if odd
13375                                                           (NHORIZWS, NVERTWS)
13376                                                         */
13377                                 &cp_list);
13378                     }
13379                 }
13380                 else if (classnum == _CC_ASCII) {
13381 #ifdef HAS_ISASCII
13382                     if (LOC) {
13383                         ANYOF_POSIXL_SET(ret, namedclass);
13384                     }
13385                     else
13386 #endif  /* Not isascii(); just use the hard-coded definition for it */
13387                         _invlist_union_maybe_complement_2nd(
13388                                 posixes,
13389                                 PL_Posix_ptrs[_CC_ASCII],
13390                                 cBOOL(namedclass % 2), /* Complement if odd
13391                                                           (NASCII) */
13392                                 &posixes);
13393                 }
13394                 else {  /* Garden variety class */
13395
13396                     /* The ascii range inversion list */
13397                     SV* ascii_source = PL_Posix_ptrs[classnum];
13398
13399                     /* The full Latin1 range inversion list */
13400                     SV* l1_source = PL_L1Posix_ptrs[classnum];
13401
13402                     /* This code is structured into two major clauses.  The
13403                      * first is for classes whose complete definitions may not
13404                      * already be known.  If not, the Latin1 definition
13405                      * (guaranteed to already known) is used plus code is
13406                      * generated to load the rest at run-time (only if needed).
13407                      * If the complete definition is known, it drops down to
13408                      * the second clause, where the complete definition is
13409                      * known */
13410
13411                     if (classnum < _FIRST_NON_SWASH_CC) {
13412
13413                         /* Here, the class has a swash, which may or not
13414                          * already be loaded */
13415
13416                         /* The name of the property to use to match the full
13417                          * eXtended Unicode range swash for this character
13418                          * class */
13419                         const char *Xname = swash_property_names[classnum];
13420
13421                         /* If returning the inversion list, we can't defer
13422                          * getting this until runtime */
13423                         if (ret_invlist && !  PL_utf8_swash_ptrs[classnum]) {
13424                             PL_utf8_swash_ptrs[classnum] =
13425                                 _core_swash_init("utf8", Xname, &PL_sv_undef,
13426                                              1, /* binary */
13427                                              0, /* not tr/// */
13428                                              NULL, /* No inversion list */
13429                                              NULL  /* No flags */
13430                                             );
13431                             assert(PL_utf8_swash_ptrs[classnum]);
13432                         }
13433                         if ( !  PL_utf8_swash_ptrs[classnum]) {
13434                             if (namedclass % 2 == 0) { /* A non-complemented
13435                                                           class */
13436                                 /* If not /a matching, there are code points we
13437                                  * don't know at compile time.  Arrange for the
13438                                  * unknown matches to be loaded at run-time, if
13439                                  * needed */
13440                                 if (! AT_LEAST_ASCII_RESTRICTED) {
13441                                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::%s\n",
13442                                                                  Xname);
13443                                 }
13444                                 if (LOC) {  /* Under locale, set run-time
13445                                                lookup */
13446                                     ANYOF_POSIXL_SET(ret, namedclass);
13447                                 }
13448                                 else {
13449                                     /* Add the current class's code points to
13450                                      * the running total */
13451                                     _invlist_union(posixes,
13452                                                    (AT_LEAST_ASCII_RESTRICTED)
13453                                                         ? ascii_source
13454                                                         : l1_source,
13455                                                    &posixes);
13456                                 }
13457                             }
13458                             else {  /* A complemented class */
13459                                 if (AT_LEAST_ASCII_RESTRICTED) {
13460                                     /* Under /a should match everything above
13461                                      * ASCII, plus the complement of the set's
13462                                      * ASCII matches */
13463                                     _invlist_union_complement_2nd(posixes,
13464                                                                   ascii_source,
13465                                                                   &posixes);
13466                                 }
13467                                 else {
13468                                     /* Arrange for the unknown matches to be
13469                                      * loaded at run-time, if needed */
13470                                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::%s\n",
13471                                                                  Xname);
13472                                     runtime_posix_matches_above_Unicode = TRUE;
13473                                     if (LOC) {
13474                                         ANYOF_POSIXL_SET(ret, namedclass);
13475                                     }
13476                                     else {
13477
13478                                         /* We want to match everything in
13479                                          * Latin1, except those things that
13480                                          * l1_source matches */
13481                                         SV* scratch_list = NULL;
13482                                         _invlist_subtract(PL_Latin1, l1_source,
13483                                                           &scratch_list);
13484
13485                                         /* Add the list from this class to the
13486                                          * running total */
13487                                         if (! posixes) {
13488                                             posixes = scratch_list;
13489                                         }
13490                                         else {
13491                                             _invlist_union(posixes,
13492                                                            scratch_list,
13493                                                            &posixes);
13494                                             SvREFCNT_dec_NN(scratch_list);
13495                                         }
13496                                         if (DEPENDS_SEMANTICS) {
13497                                             ANYOF_FLAGS(ret)
13498                                                   |= ANYOF_NON_UTF8_LATIN1_ALL;
13499                                         }
13500                                     }
13501                                 }
13502                             }
13503                             goto namedclass_done;
13504                         }
13505
13506                         /* Here, there is a swash loaded for the class.  If no
13507                          * inversion list for it yet, get it */
13508                         if (! PL_XPosix_ptrs[classnum]) {
13509                             PL_XPosix_ptrs[classnum]
13510                              = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]);
13511                         }
13512                     }
13513
13514                     /* Here there is an inversion list already loaded for the
13515                      * entire class */
13516
13517                     if (namedclass % 2 == 0) {  /* A non-complemented class,
13518                                                    like ANYOF_PUNCT */
13519                         if (! LOC) {
13520                             /* For non-locale, just add it to any existing list
13521                              * */
13522                             _invlist_union(posixes,
13523                                            (AT_LEAST_ASCII_RESTRICTED)
13524                                                ? ascii_source
13525                                                : PL_XPosix_ptrs[classnum],
13526                                            &posixes);
13527                         }
13528                         else {  /* Locale */
13529                             SV* scratch_list = NULL;
13530
13531                             /* For above Latin1 code points, we use the full
13532                              * Unicode range */
13533                             _invlist_intersection(PL_AboveLatin1,
13534                                                   PL_XPosix_ptrs[classnum],
13535                                                   &scratch_list);
13536                             /* And set the output to it, adding instead if
13537                              * there already is an output.  Checking if
13538                              * 'posixes' is NULL first saves an extra clone.
13539                              * Its reference count will be decremented at the
13540                              * next union, etc, or if this is the only
13541                              * instance, at the end of the routine */
13542                             if (! posixes) {
13543                                 posixes = scratch_list;
13544                             }
13545                             else {
13546                                 _invlist_union(posixes, scratch_list, &posixes);
13547                                 SvREFCNT_dec_NN(scratch_list);
13548                             }
13549
13550 #ifndef HAS_ISBLANK
13551                             if (namedclass != ANYOF_BLANK) {
13552 #endif
13553                                 /* Set this class in the node for runtime
13554                                  * matching */
13555                                 ANYOF_POSIXL_SET(ret, namedclass);
13556 #ifndef HAS_ISBLANK
13557                             }
13558                             else {
13559                                 /* No isblank(), use the hard-coded ASCII-range
13560                                  * blanks, adding them to the running total. */
13561
13562                                 _invlist_union(posixes, ascii_source, &posixes);
13563                             }
13564 #endif
13565                         }
13566                     }
13567                     else {  /* A complemented class, like ANYOF_NPUNCT */
13568                         if (! LOC) {
13569                             _invlist_union_complement_2nd(
13570                                                 posixes,
13571                                                 (AT_LEAST_ASCII_RESTRICTED)
13572                                                     ? ascii_source
13573                                                     : PL_XPosix_ptrs[classnum],
13574                                                 &posixes);
13575                             /* Under /d, everything in the upper half of the
13576                              * Latin1 range matches this complement */
13577                             if (DEPENDS_SEMANTICS) {
13578                                 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
13579                             }
13580                         }
13581                         else {  /* Locale */
13582                             SV* scratch_list = NULL;
13583                             _invlist_subtract(PL_AboveLatin1,
13584                                               PL_XPosix_ptrs[classnum],
13585                                               &scratch_list);
13586                             if (! posixes) {
13587                                 posixes = scratch_list;
13588                             }
13589                             else {
13590                                 _invlist_union(posixes, scratch_list, &posixes);
13591                                 SvREFCNT_dec_NN(scratch_list);
13592                             }
13593 #ifndef HAS_ISBLANK
13594                             if (namedclass != ANYOF_NBLANK) {
13595 #endif
13596                                 ANYOF_POSIXL_SET(ret, namedclass);
13597 #ifndef HAS_ISBLANK
13598                             }
13599                             else {
13600                                 /* Get the list of all code points in Latin1
13601                                  * that are not ASCII blanks, and add them to
13602                                  * the running total */
13603                                 _invlist_subtract(PL_Latin1, ascii_source,
13604                                                   &scratch_list);
13605                                 _invlist_union(posixes, scratch_list, &posixes);
13606                                 SvREFCNT_dec_NN(scratch_list);
13607                             }
13608 #endif
13609                         }
13610                     }
13611                 }
13612               namedclass_done:
13613                 continue;   /* Go get next character */
13614             }
13615         } /* end of namedclass \blah */
13616
13617         /* Here, we have a single value.  If 'range' is set, it is the ending
13618          * of a range--check its validity.  Later, we will handle each
13619          * individual code point in the range.  If 'range' isn't set, this
13620          * could be the beginning of a range, so check for that by looking
13621          * ahead to see if the next real character to be processed is the range
13622          * indicator--the minus sign */
13623
13624         if (skip_white) {
13625             RExC_parse = regpatws(pRExC_state, RExC_parse,
13626                                 FALSE /* means don't recognize comments */);
13627         }
13628
13629         if (range) {
13630             if (prevvalue > value) /* b-a */ {
13631                 const int w = RExC_parse - rangebegin;
13632                 vFAIL2utf8f(
13633                     "Invalid [] range \"%"UTF8f"\"",
13634                     UTF8fARG(UTF, w, rangebegin));
13635                 range = 0; /* not a valid range */
13636             }
13637         }
13638         else {
13639             prevvalue = value; /* save the beginning of the potential range */
13640             if (! stop_at_1     /* Can't be a range if parsing just one thing */
13641                 && *RExC_parse == '-')
13642             {
13643                 char* next_char_ptr = RExC_parse + 1;
13644                 if (skip_white) {   /* Get the next real char after the '-' */
13645                     next_char_ptr = regpatws(pRExC_state,
13646                                              RExC_parse + 1,
13647                                              FALSE); /* means don't recognize
13648                                                         comments */
13649                 }
13650
13651                 /* If the '-' is at the end of the class (just before the ']',
13652                  * it is a literal minus; otherwise it is a range */
13653                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
13654                     RExC_parse = next_char_ptr;
13655
13656                     /* a bad range like \w-, [:word:]- ? */
13657                     if (namedclass > OOB_NAMEDCLASS) {
13658                         if (strict || ckWARN(WARN_REGEXP)) {
13659                             const int w =
13660                                 RExC_parse >= rangebegin ?
13661                                 RExC_parse - rangebegin : 0;
13662                             if (strict) {
13663                                 vFAIL4("False [] range \"%*.*s\"",
13664                                     w, w, rangebegin);
13665                             }
13666                             else {
13667                                 vWARN4(RExC_parse,
13668                                     "False [] range \"%*.*s\"",
13669                                     w, w, rangebegin);
13670                             }
13671                         }
13672                         if (!SIZE_ONLY) {
13673                             cp_list = add_cp_to_invlist(cp_list, '-');
13674                         }
13675                         element_count++;
13676                     } else
13677                         range = 1;      /* yeah, it's a range! */
13678                     continue;   /* but do it the next time */
13679                 }
13680             }
13681         }
13682
13683         /* Here, <prevvalue> is the beginning of the range, if any; or <value>
13684          * if not */
13685
13686         /* non-Latin1 code point implies unicode semantics.  Must be set in
13687          * pass1 so is there for the whole of pass 2 */
13688         if (value > 255) {
13689             RExC_uni_semantics = 1;
13690         }
13691
13692         /* Ready to process either the single value, or the completed range.
13693          * For single-valued non-inverted ranges, we consider the possibility
13694          * of multi-char folds.  (We made a conscious decision to not do this
13695          * for the other cases because it can often lead to non-intuitive
13696          * results.  For example, you have the peculiar case that:
13697          *  "s s" =~ /^[^\xDF]+$/i => Y
13698          *  "ss"  =~ /^[^\xDF]+$/i => N
13699          *
13700          * See [perl #89750] */
13701         if (FOLD && allow_multi_folds && value == prevvalue) {
13702             if (value == LATIN_SMALL_LETTER_SHARP_S
13703                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
13704                                                         value)))
13705             {
13706                 /* Here <value> is indeed a multi-char fold.  Get what it is */
13707
13708                 U8 foldbuf[UTF8_MAXBYTES_CASE];
13709                 STRLEN foldlen;
13710
13711                 UV folded = _to_uni_fold_flags(
13712                                 value,
13713                                 foldbuf,
13714                                 &foldlen,
13715                                 FOLD_FLAGS_FULL
13716                                 | ((LOC) ?  FOLD_FLAGS_LOCALE
13717                                             : (ASCII_FOLD_RESTRICTED)
13718                                               ? FOLD_FLAGS_NOMIX_ASCII
13719                                               : 0)
13720                                 );
13721
13722                 /* Here, <folded> should be the first character of the
13723                  * multi-char fold of <value>, with <foldbuf> containing the
13724                  * whole thing.  But, if this fold is not allowed (because of
13725                  * the flags), <fold> will be the same as <value>, and should
13726                  * be processed like any other character, so skip the special
13727                  * handling */
13728                 if (folded != value) {
13729
13730                     /* Skip if we are recursed, currently parsing the class
13731                      * again.  Otherwise add this character to the list of
13732                      * multi-char folds. */
13733                     if (! RExC_in_multi_char_class) {
13734                         AV** this_array_ptr;
13735                         AV* this_array;
13736                         STRLEN cp_count = utf8_length(foldbuf,
13737                                                       foldbuf + foldlen);
13738                         SV* multi_fold = sv_2mortal(newSVpvn("", 0));
13739
13740                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
13741
13742
13743                         if (! multi_char_matches) {
13744                             multi_char_matches = newAV();
13745                         }
13746
13747                         /* <multi_char_matches> is actually an array of arrays.
13748                          * There will be one or two top-level elements: [2],
13749                          * and/or [3].  The [2] element is an array, each
13750                          * element thereof is a character which folds to TWO
13751                          * characters; [3] is for folds to THREE characters.
13752                          * (Unicode guarantees a maximum of 3 characters in any
13753                          * fold.)  When we rewrite the character class below,
13754                          * we will do so such that the longest folds are
13755                          * written first, so that it prefers the longest
13756                          * matching strings first.  This is done even if it
13757                          * turns out that any quantifier is non-greedy, out of
13758                          * programmer laziness.  Tom Christiansen has agreed
13759                          * that this is ok.  This makes the test for the
13760                          * ligature 'ffi' come before the test for 'ff' */
13761                         if (av_exists(multi_char_matches, cp_count)) {
13762                             this_array_ptr = (AV**) av_fetch(multi_char_matches,
13763                                                              cp_count, FALSE);
13764                             this_array = *this_array_ptr;
13765                         }
13766                         else {
13767                             this_array = newAV();
13768                             av_store(multi_char_matches, cp_count,
13769                                      (SV*) this_array);
13770                         }
13771                         av_push(this_array, multi_fold);
13772                     }
13773
13774                     /* This element should not be processed further in this
13775                      * class */
13776                     element_count--;
13777                     value = save_value;
13778                     prevvalue = save_prevvalue;
13779                     continue;
13780                 }
13781             }
13782         }
13783
13784         /* Deal with this element of the class */
13785         if (! SIZE_ONLY) {
13786 #ifndef EBCDIC
13787             cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
13788 #else
13789             SV* this_range = _new_invlist(1);
13790             _append_range_to_invlist(this_range, prevvalue, value);
13791
13792             /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
13793              * If this range was specified using something like 'i-j', we want
13794              * to include only the 'i' and the 'j', and not anything in
13795              * between, so exclude non-ASCII, non-alphabetics from it.
13796              * However, if the range was specified with something like
13797              * [\x89-\x91] or [\x89-j], all code points within it should be
13798              * included.  literal_endpoint==2 means both ends of the range used
13799              * a literal character, not \x{foo} */
13800             if (literal_endpoint == 2
13801                 && ((prevvalue >= 'a' && value <= 'z')
13802                     || (prevvalue >= 'A' && value <= 'Z')))
13803             {
13804                 _invlist_intersection(this_range, PL_ASCII,
13805                                       &this_range);
13806                 _invlist_intersection(this_range, PL_Posix_ptrs[_CC_ALPHA],
13807                                       &this_range);
13808             }
13809             _invlist_union(cp_list, this_range, &cp_list);
13810             literal_endpoint = 0;
13811 #endif
13812         }
13813
13814         range = 0; /* this range (if it was one) is done now */
13815     } /* End of loop through all the text within the brackets */
13816
13817     /* If anything in the class expands to more than one character, we have to
13818      * deal with them by building up a substitute parse string, and recursively
13819      * calling reg() on it, instead of proceeding */
13820     if (multi_char_matches) {
13821         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
13822         I32 cp_count;
13823         STRLEN len;
13824         char *save_end = RExC_end;
13825         char *save_parse = RExC_parse;
13826         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
13827                                        a "|" */
13828         I32 reg_flags;
13829
13830         assert(! invert);
13831 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
13832            because too confusing */
13833         if (invert) {
13834             sv_catpv(substitute_parse, "(?:");
13835         }
13836 #endif
13837
13838         /* Look at the longest folds first */
13839         for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
13840
13841             if (av_exists(multi_char_matches, cp_count)) {
13842                 AV** this_array_ptr;
13843                 SV* this_sequence;
13844
13845                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
13846                                                  cp_count, FALSE);
13847                 while ((this_sequence = av_pop(*this_array_ptr)) !=
13848                                                                 &PL_sv_undef)
13849                 {
13850                     if (! first_time) {
13851                         sv_catpv(substitute_parse, "|");
13852                     }
13853                     first_time = FALSE;
13854
13855                     sv_catpv(substitute_parse, SvPVX(this_sequence));
13856                 }
13857             }
13858         }
13859
13860         /* If the character class contains anything else besides these
13861          * multi-character folds, have to include it in recursive parsing */
13862         if (element_count) {
13863             sv_catpv(substitute_parse, "|[");
13864             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
13865             sv_catpv(substitute_parse, "]");
13866         }
13867
13868         sv_catpv(substitute_parse, ")");
13869 #if 0
13870         if (invert) {
13871             /* This is a way to get the parse to skip forward a whole named
13872              * sequence instead of matching the 2nd character when it fails the
13873              * first */
13874             sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
13875         }
13876 #endif
13877
13878         RExC_parse = SvPV(substitute_parse, len);
13879         RExC_end = RExC_parse + len;
13880         RExC_in_multi_char_class = 1;
13881         RExC_emit = (regnode *)orig_emit;
13882
13883         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
13884
13885         *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
13886
13887         RExC_parse = save_parse;
13888         RExC_end = save_end;
13889         RExC_in_multi_char_class = 0;
13890         SvREFCNT_dec_NN(multi_char_matches);
13891         return ret;
13892     }
13893
13894     /* If the character class contains only a single element, it may be
13895      * optimizable into another node type which is smaller and runs faster.
13896      * Check if this is the case for this class */
13897     if ((element_count == 1 && ! ret_invlist)
13898         || UNLIKELY(posixl_matches_all))
13899     {
13900         U8 op = END;
13901         U8 arg = 0;
13902
13903         if (UNLIKELY(posixl_matches_all)) {
13904             op = SANY;
13905         }
13906         else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
13907                                                    \w or [:digit:] or \p{foo}
13908                                                  */
13909
13910             /* All named classes are mapped into POSIXish nodes, with its FLAG
13911              * argument giving which class it is */
13912             switch ((I32)namedclass) {
13913                 case ANYOF_UNIPROP:
13914                     break;
13915
13916                 /* These don't depend on the charset modifiers.  They always
13917                  * match under /u rules */
13918                 case ANYOF_NHORIZWS:
13919                 case ANYOF_HORIZWS:
13920                     namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
13921                     /* FALLTHROUGH */
13922
13923                 case ANYOF_NVERTWS:
13924                 case ANYOF_VERTWS:
13925                     op = POSIXU;
13926                     goto join_posix;
13927
13928                 /* The actual POSIXish node for all the rest depends on the
13929                  * charset modifier.  The ones in the first set depend only on
13930                  * ASCII or, if available on this platform, locale */
13931                 case ANYOF_ASCII:
13932                 case ANYOF_NASCII:
13933 #ifdef HAS_ISASCII
13934                     op = (LOC) ? POSIXL : POSIXA;
13935 #else
13936                     op = POSIXA;
13937 #endif
13938                     goto join_posix;
13939
13940                 case ANYOF_NCASED:
13941                 case ANYOF_LOWER:
13942                 case ANYOF_NLOWER:
13943                 case ANYOF_UPPER:
13944                 case ANYOF_NUPPER:
13945                     /* under /a could be alpha */
13946                     if (FOLD) {
13947                         if (ASCII_RESTRICTED) {
13948                             namedclass = ANYOF_ALPHA + (namedclass % 2);
13949                         }
13950                         else if (! LOC) {
13951                             break;
13952                         }
13953                     }
13954                     /* FALLTHROUGH */
13955
13956                 /* The rest have more possibilities depending on the charset.
13957                  * We take advantage of the enum ordering of the charset
13958                  * modifiers to get the exact node type, */
13959                 default:
13960                     op = POSIXD + get_regex_charset(RExC_flags);
13961                     if (op > POSIXA) { /* /aa is same as /a */
13962                         op = POSIXA;
13963                     }
13964 #ifndef HAS_ISBLANK
13965                     if (op == POSIXL
13966                         && (namedclass == ANYOF_BLANK
13967                             || namedclass == ANYOF_NBLANK))
13968                     {
13969                         op = POSIXA;
13970                     }
13971 #endif
13972
13973                 join_posix:
13974                     /* The odd numbered ones are the complements of the
13975                      * next-lower even number one */
13976                     if (namedclass % 2 == 1) {
13977                         invert = ! invert;
13978                         namedclass--;
13979                     }
13980                     arg = namedclass_to_classnum(namedclass);
13981                     break;
13982             }
13983         }
13984         else if (value == prevvalue) {
13985
13986             /* Here, the class consists of just a single code point */
13987
13988             if (invert) {
13989                 if (! LOC && value == '\n') {
13990                     op = REG_ANY; /* Optimize [^\n] */
13991                     *flagp |= HASWIDTH|SIMPLE;
13992                     RExC_naughty++;
13993                 }
13994             }
13995             else if (value < 256 || UTF) {
13996
13997                 /* Optimize a single value into an EXACTish node, but not if it
13998                  * would require converting the pattern to UTF-8. */
13999                 op = compute_EXACTish(pRExC_state);
14000             }
14001         } /* Otherwise is a range */
14002         else if (! LOC) {   /* locale could vary these */
14003             if (prevvalue == '0') {
14004                 if (value == '9') {
14005                     arg = _CC_DIGIT;
14006                     op = POSIXA;
14007                 }
14008             }
14009         }
14010
14011         /* Here, we have changed <op> away from its initial value iff we found
14012          * an optimization */
14013         if (op != END) {
14014
14015             /* Throw away this ANYOF regnode, and emit the calculated one,
14016              * which should correspond to the beginning, not current, state of
14017              * the parse */
14018             const char * cur_parse = RExC_parse;
14019             RExC_parse = (char *)orig_parse;
14020             if ( SIZE_ONLY) {
14021                 if (! LOC) {
14022
14023                     /* To get locale nodes to not use the full ANYOF size would
14024                      * require moving the code above that writes the portions
14025                      * of it that aren't in other nodes to after this point.
14026                      * e.g.  ANYOF_POSIXL_SET */
14027                     RExC_size = orig_size;
14028                 }
14029             }
14030             else {
14031                 RExC_emit = (regnode *)orig_emit;
14032                 if (PL_regkind[op] == POSIXD) {
14033                     if (invert) {
14034                         op += NPOSIXD - POSIXD;
14035                     }
14036                 }
14037             }
14038
14039             ret = reg_node(pRExC_state, op);
14040
14041             if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
14042                 if (! SIZE_ONLY) {
14043                     FLAGS(ret) = arg;
14044                 }
14045                 *flagp |= HASWIDTH|SIMPLE;
14046             }
14047             else if (PL_regkind[op] == EXACT) {
14048                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
14049             }
14050
14051             RExC_parse = (char *) cur_parse;
14052
14053             SvREFCNT_dec(posixes);
14054             SvREFCNT_dec(cp_list);
14055             return ret;
14056         }
14057     }
14058
14059     if (SIZE_ONLY)
14060         return ret;
14061     /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
14062
14063     /* If folding, we calculate all characters that could fold to or from the
14064      * ones already on the list */
14065     if (FOLD && cp_list) {
14066         UV start, end;  /* End points of code point ranges */
14067
14068         SV* fold_intersection = NULL;
14069
14070         /* If the highest code point is within Latin1, we can use the
14071          * compiled-in Alphas list, and not have to go out to disk.  This
14072          * yields two false positives, the masculine and feminine ordinal
14073          * indicators, which are weeded out below using the
14074          * IS_IN_SOME_FOLD_L1() macro */
14075         if (invlist_highest(cp_list) < 256) {
14076             _invlist_intersection(PL_L1Posix_ptrs[_CC_ALPHA], cp_list,
14077                                                            &fold_intersection);
14078         }
14079         else {
14080
14081             /* Here, there are non-Latin1 code points, so we will have to go
14082              * fetch the list of all the characters that participate in folds
14083              */
14084             if (! PL_utf8_foldable) {
14085                 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
14086                                        &PL_sv_undef, 1, 0);
14087                 PL_utf8_foldable = _get_swash_invlist(swash);
14088                 SvREFCNT_dec_NN(swash);
14089             }
14090
14091             /* This is a hash that for a particular fold gives all characters
14092              * that are involved in it */
14093             if (! PL_utf8_foldclosures) {
14094
14095                 /* If we were unable to find any folds, then we likely won't be
14096                  * able to find the closures.  So just create an empty list.
14097                  * Folding will effectively be restricted to the non-Unicode
14098                  * rules hard-coded into Perl.  (This case happens legitimately
14099                  * during compilation of Perl itself before the Unicode tables
14100                  * are generated) */
14101                 if (_invlist_len(PL_utf8_foldable) == 0) {
14102                     PL_utf8_foldclosures = newHV();
14103                 }
14104                 else {
14105                     /* If the folds haven't been read in, call a fold function
14106                      * to force that */
14107                     if (! PL_utf8_tofold) {
14108                         U8 dummy[UTF8_MAXBYTES_CASE+1];
14109
14110                         /* This string is just a short named one above \xff */
14111                         to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
14112                         assert(PL_utf8_tofold); /* Verify that worked */
14113                     }
14114                     PL_utf8_foldclosures =
14115                                     _swash_inversion_hash(PL_utf8_tofold);
14116                 }
14117             }
14118
14119             /* Only the characters in this class that participate in folds need
14120              * be checked.  Get the intersection of this class and all the
14121              * possible characters that are foldable.  This can quickly narrow
14122              * down a large class */
14123             _invlist_intersection(PL_utf8_foldable, cp_list,
14124                                   &fold_intersection);
14125         }
14126
14127         /* Now look at the foldable characters in this class individually */
14128         invlist_iterinit(fold_intersection);
14129         while (invlist_iternext(fold_intersection, &start, &end)) {
14130             UV j;
14131
14132             /* Locale folding for Latin1 characters is deferred until runtime */
14133             if (LOC && start < 256) {
14134                 start = 256;
14135             }
14136
14137             /* Look at every character in the range */
14138             for (j = start; j <= end; j++) {
14139
14140                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
14141                 STRLEN foldlen;
14142                 SV** listp;
14143
14144                 if (j < 256) {
14145
14146                     /* We have the latin1 folding rules hard-coded here so that
14147                      * an innocent-looking character class, like /[ks]/i won't
14148                      * have to go out to disk to find the possible matches.
14149                      * XXX It would be better to generate these via regen, in
14150                      * case a new version of the Unicode standard adds new
14151                      * mappings, though that is not really likely, and may be
14152                      * caught by the default: case of the switch below. */
14153
14154                     if (IS_IN_SOME_FOLD_L1(j)) {
14155
14156                         /* ASCII is always matched; non-ASCII is matched only
14157                          * under Unicode rules */
14158                         if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
14159                             cp_list =
14160                                 add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
14161                         }
14162                         else {
14163                             depends_list =
14164                              add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
14165                         }
14166                     }
14167
14168                     if (HAS_NONLATIN1_FOLD_CLOSURE(j)
14169                         && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
14170                     {
14171                         /* Certain Latin1 characters have matches outside
14172                          * Latin1.  To get here, <j> is one of those
14173                          * characters.   None of these matches is valid for
14174                          * ASCII characters under /aa, which is why the 'if'
14175                          * just above excludes those.  These matches only
14176                          * happen when the target string is utf8.  The code
14177                          * below adds the single fold closures for <j> to the
14178                          * inversion list. */
14179                         switch (j) {
14180                             case 'k':
14181                             case 'K':
14182                                 cp_list =
14183                                     add_cp_to_invlist(cp_list, KELVIN_SIGN);
14184                                 break;
14185                             case 's':
14186                             case 'S':
14187                                 cp_list = add_cp_to_invlist(cp_list,
14188                                                     LATIN_SMALL_LETTER_LONG_S);
14189                                 break;
14190                             case MICRO_SIGN:
14191                                 cp_list = add_cp_to_invlist(cp_list,
14192                                                     GREEK_CAPITAL_LETTER_MU);
14193                                 cp_list = add_cp_to_invlist(cp_list,
14194                                                     GREEK_SMALL_LETTER_MU);
14195                                 break;
14196                             case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
14197                             case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
14198                                 cp_list =
14199                                     add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
14200                                 break;
14201                             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
14202                                 cp_list = add_cp_to_invlist(cp_list,
14203                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
14204                                 break;
14205                             case LATIN_SMALL_LETTER_SHARP_S:
14206                                 cp_list = add_cp_to_invlist(cp_list,
14207                                                 LATIN_CAPITAL_LETTER_SHARP_S);
14208                                 break;
14209                             case 'F': case 'f':
14210                             case 'I': case 'i':
14211                             case 'L': case 'l':
14212                             case 'T': case 't':
14213                             case 'A': case 'a':
14214                             case 'H': case 'h':
14215                             case 'J': case 'j':
14216                             case 'N': case 'n':
14217                             case 'W': case 'w':
14218                             case 'Y': case 'y':
14219                                 /* These all are targets of multi-character
14220                                  * folds from code points that require UTF8 to
14221                                  * express, so they can't match unless the
14222                                  * target string is in UTF-8, so no action here
14223                                  * is necessary, as regexec.c properly handles
14224                                  * the general case for UTF-8 matching and
14225                                  * multi-char folds */
14226                                 break;
14227                             default:
14228                                 /* Use deprecated warning to increase the
14229                                  * chances of this being output */
14230                                 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
14231                                 break;
14232                         }
14233                     }
14234                     continue;
14235                 }
14236
14237                 /* Here is an above Latin1 character.  We don't have the rules
14238                  * hard-coded for it.  First, get its fold.  This is the simple
14239                  * fold, as the multi-character folds have been handled earlier
14240                  * and separated out */
14241                 _to_uni_fold_flags(j, foldbuf, &foldlen,
14242                                                ((LOC)
14243                                                ? FOLD_FLAGS_LOCALE
14244                                                : (ASCII_FOLD_RESTRICTED)
14245                                                   ? FOLD_FLAGS_NOMIX_ASCII
14246                                                   : 0));
14247
14248                 /* Single character fold of above Latin1.  Add everything in
14249                  * its fold closure to the list that this node should match.
14250                  * The fold closures data structure is a hash with the keys
14251                  * being the UTF-8 of every character that is folded to, like
14252                  * 'k', and the values each an array of all code points that
14253                  * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
14254                  * Multi-character folds are not included */
14255                 if ((listp = hv_fetch(PL_utf8_foldclosures,
14256                                       (char *) foldbuf, foldlen, FALSE)))
14257                 {
14258                     AV* list = (AV*) *listp;
14259                     IV k;
14260                     for (k = 0; k <= av_len(list); k++) {
14261                         SV** c_p = av_fetch(list, k, FALSE);
14262                         UV c;
14263                         if (c_p == NULL) {
14264                             Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
14265                         }
14266                         c = SvUV(*c_p);
14267
14268                         /* /aa doesn't allow folds between ASCII and non-; /l
14269                          * doesn't allow them between above and below 256 */
14270                         if ((ASCII_FOLD_RESTRICTED
14271                                   && (isASCII(c) != isASCII(j)))
14272                             || (LOC && c < 256)) {
14273                             continue;
14274                         }
14275
14276                         /* Folds involving non-ascii Latin1 characters
14277                          * under /d are added to a separate list */
14278                         if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
14279                         {
14280                             cp_list = add_cp_to_invlist(cp_list, c);
14281                         }
14282                         else {
14283                           depends_list = add_cp_to_invlist(depends_list, c);
14284                         }
14285                     }
14286                 }
14287             }
14288         }
14289         SvREFCNT_dec_NN(fold_intersection);
14290     }
14291
14292     /* And combine the result (if any) with any inversion list from posix
14293      * classes.  The lists are kept separate up to now because we don't want to
14294      * fold the classes (folding of those is automatically handled by the swash
14295      * fetching code) */
14296     if (posixes) {
14297         if (! DEPENDS_SEMANTICS) {
14298             if (cp_list) {
14299                 _invlist_union(cp_list, posixes, &cp_list);
14300                 SvREFCNT_dec_NN(posixes);
14301             }
14302             else {
14303                 cp_list = posixes;
14304             }
14305         }
14306         else {
14307             /* Under /d, we put into a separate list the Latin1 things that
14308              * match only when the target string is utf8 */
14309             SV* nonascii_but_latin1_properties = NULL;
14310             _invlist_intersection(posixes, PL_UpperLatin1,
14311                                   &nonascii_but_latin1_properties);
14312             _invlist_subtract(posixes, nonascii_but_latin1_properties,
14313                               &posixes);
14314             if (cp_list) {
14315                 _invlist_union(cp_list, posixes, &cp_list);
14316                 SvREFCNT_dec_NN(posixes);
14317             }
14318             else {
14319                 cp_list = posixes;
14320             }
14321
14322             if (depends_list) {
14323                 _invlist_union(depends_list, nonascii_but_latin1_properties,
14324                                &depends_list);
14325                 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
14326             }
14327             else {
14328                 depends_list = nonascii_but_latin1_properties;
14329             }
14330         }
14331     }
14332
14333     /* And combine the result (if any) with any inversion list from properties.
14334      * The lists are kept separate up to now so that we can distinguish the two
14335      * in regards to matching above-Unicode.  A run-time warning is generated
14336      * if a Unicode property is matched against a non-Unicode code point. But,
14337      * we allow user-defined properties to match anything, without any warning,
14338      * and we also suppress the warning if there is a portion of the character
14339      * class that isn't a Unicode property, and which matches above Unicode, \W
14340      * or [\x{110000}] for example.
14341      * (Note that in this case, unlike the Posix one above, there is no
14342      * <depends_list>, because having a Unicode property forces Unicode
14343      * semantics */
14344     if (properties) {
14345         bool warn_super = ! has_user_defined_property;
14346         if (cp_list) {
14347
14348             /* If it matters to the final outcome, see if a non-property
14349              * component of the class matches above Unicode.  If so, the
14350              * warning gets suppressed.  This is true even if just a single
14351              * such code point is specified, as though not strictly correct if
14352              * another such code point is matched against, the fact that they
14353              * are using above-Unicode code points indicates they should know
14354              * the issues involved */
14355             if (warn_super) {
14356                 bool non_prop_matches_above_Unicode =
14357                             runtime_posix_matches_above_Unicode
14358                             | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
14359                 if (invert) {
14360                     non_prop_matches_above_Unicode =
14361                                             !  non_prop_matches_above_Unicode;
14362                 }
14363                 warn_super = ! non_prop_matches_above_Unicode;
14364             }
14365
14366             _invlist_union(properties, cp_list, &cp_list);
14367             SvREFCNT_dec_NN(properties);
14368         }
14369         else {
14370             cp_list = properties;
14371         }
14372
14373         if (warn_super) {
14374             OP(ret) = ANYOF_WARN_SUPER;
14375         }
14376     }
14377
14378     /* Here, we have calculated what code points should be in the character
14379      * class.
14380      *
14381      * Now we can see about various optimizations.  Fold calculation (which we
14382      * did above) needs to take place before inversion.  Otherwise /[^k]/i
14383      * would invert to include K, which under /i would match k, which it
14384      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
14385      * folded until runtime */
14386
14387     /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
14388      * at compile time.  Besides not inverting folded locale now, we can't
14389      * invert if there are things such as \w, which aren't known until runtime
14390      * */
14391     if (invert
14392         && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_POSIXL)))
14393         && ! depends_list
14394         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14395     {
14396         _invlist_invert(cp_list);
14397
14398         /* Any swash can't be used as-is, because we've inverted things */
14399         if (swash) {
14400             SvREFCNT_dec_NN(swash);
14401             swash = NULL;
14402         }
14403
14404         /* Clear the invert flag since have just done it here */
14405         invert = FALSE;
14406     }
14407
14408     if (ret_invlist) {
14409         *ret_invlist = cp_list;
14410         SvREFCNT_dec(swash);
14411
14412         /* Discard the generated node */
14413         if (SIZE_ONLY) {
14414             RExC_size = orig_size;
14415         }
14416         else {
14417             RExC_emit = orig_emit;
14418         }
14419         return orig_emit;
14420     }
14421
14422     /* If we didn't do folding, it's because some information isn't available
14423      * until runtime; set the run-time fold flag for these.  (We don't have to
14424      * worry about properties folding, as that is taken care of by the swash
14425      * fetching) */
14426     if (FOLD && LOC)
14427     {
14428        ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14429     }
14430
14431     /* Some character classes are equivalent to other nodes.  Such nodes take
14432      * up less room and generally fewer operations to execute than ANYOF nodes.
14433      * Above, we checked for and optimized into some such equivalents for
14434      * certain common classes that are easy to test.  Getting to this point in
14435      * the code means that the class didn't get optimized there.  Since this
14436      * code is only executed in Pass 2, it is too late to save space--it has
14437      * been allocated in Pass 1, and currently isn't given back.  But turning
14438      * things into an EXACTish node can allow the optimizer to join it to any
14439      * adjacent such nodes.  And if the class is equivalent to things like /./,
14440      * expensive run-time swashes can be avoided.  Now that we have more
14441      * complete information, we can find things necessarily missed by the
14442      * earlier code.  I (khw) am not sure how much to look for here.  It would
14443      * be easy, but perhaps too slow, to check any candidates against all the
14444      * node types they could possibly match using _invlistEQ(). */
14445
14446     if (cp_list
14447         && ! invert
14448         && ! depends_list
14449         && ! (ANYOF_FLAGS(ret) & ANYOF_POSIXL)
14450         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14451     {
14452         UV start, end;
14453         U8 op = END;  /* The optimzation node-type */
14454         const char * cur_parse= RExC_parse;
14455
14456         invlist_iterinit(cp_list);
14457         if (! invlist_iternext(cp_list, &start, &end)) {
14458
14459             /* Here, the list is empty.  This happens, for example, when a
14460              * Unicode property is the only thing in the character class, and
14461              * it doesn't match anything.  (perluniprops.pod notes such
14462              * properties) */
14463             op = OPFAIL;
14464             *flagp |= HASWIDTH|SIMPLE;
14465         }
14466         else if (start == end) {    /* The range is a single code point */
14467             if (! invlist_iternext(cp_list, &start, &end)
14468
14469                     /* Don't do this optimization if it would require changing
14470                      * the pattern to UTF-8 */
14471                 && (start < 256 || UTF))
14472             {
14473                 /* Here, the list contains a single code point.  Can optimize
14474                  * into an EXACT node */
14475
14476                 value = start;
14477
14478                 if (! FOLD) {
14479                     op = EXACT;
14480                 }
14481                 else if (LOC) {
14482
14483                     /* A locale node under folding with one code point can be
14484                      * an EXACTFL, as its fold won't be calculated until
14485                      * runtime */
14486                     op = EXACTFL;
14487                 }
14488                 else {
14489
14490                     /* Here, we are generally folding, but there is only one
14491                      * code point to match.  If we have to, we use an EXACT
14492                      * node, but it would be better for joining with adjacent
14493                      * nodes in the optimization pass if we used the same
14494                      * EXACTFish node that any such are likely to be.  We can
14495                      * do this iff the code point doesn't participate in any
14496                      * folds.  For example, an EXACTF of a colon is the same as
14497                      * an EXACT one, since nothing folds to or from a colon. */
14498                     if (value < 256) {
14499                         if (IS_IN_SOME_FOLD_L1(value)) {
14500                             op = EXACT;
14501                         }
14502                     }
14503                     else {
14504                         if (! PL_utf8_foldable) {
14505                             SV* swash = swash_init("utf8", "_Perl_Any_Folds",
14506                                                 &PL_sv_undef, 1, 0);
14507                             PL_utf8_foldable = _get_swash_invlist(swash);
14508                             SvREFCNT_dec_NN(swash);
14509                         }
14510                         if (_invlist_contains_cp(PL_utf8_foldable, value)) {
14511                             op = EXACT;
14512                         }
14513                     }
14514
14515                     /* If we haven't found the node type, above, it means we
14516                      * can use the prevailing one */
14517                     if (op == END) {
14518                         op = compute_EXACTish(pRExC_state);
14519                     }
14520                 }
14521             }
14522         }
14523         else if (start == 0) {
14524             if (end == UV_MAX) {
14525                 op = SANY;
14526                 *flagp |= HASWIDTH|SIMPLE;
14527                 RExC_naughty++;
14528             }
14529             else if (end == '\n' - 1
14530                     && invlist_iternext(cp_list, &start, &end)
14531                     && start == '\n' + 1 && end == UV_MAX)
14532             {
14533                 op = REG_ANY;
14534                 *flagp |= HASWIDTH|SIMPLE;
14535                 RExC_naughty++;
14536             }
14537         }
14538         invlist_iterfinish(cp_list);
14539
14540         if (op != END) {
14541             RExC_parse = (char *)orig_parse;
14542             RExC_emit = (regnode *)orig_emit;
14543
14544             ret = reg_node(pRExC_state, op);
14545
14546             RExC_parse = (char *)cur_parse;
14547
14548             if (PL_regkind[op] == EXACT) {
14549                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
14550             }
14551
14552             SvREFCNT_dec_NN(cp_list);
14553             return ret;
14554         }
14555     }
14556
14557     /* Here, <cp_list> contains all the code points we can determine at
14558      * compile time that match under all conditions.  Go through it, and
14559      * for things that belong in the bitmap, put them there, and delete from
14560      * <cp_list>.  While we are at it, see if everything above 255 is in the
14561      * list, and if so, set a flag to speed up execution */
14562
14563     populate_ANYOF_from_invlist(ret, &cp_list);
14564
14565     if (invert) {
14566         ANYOF_FLAGS(ret) |= ANYOF_INVERT;
14567     }
14568
14569     /* Here, the bitmap has been populated with all the Latin1 code points that
14570      * always match.  Can now add to the overall list those that match only
14571      * when the target string is UTF-8 (<depends_list>). */
14572     if (depends_list) {
14573         if (cp_list) {
14574             _invlist_union(cp_list, depends_list, &cp_list);
14575             SvREFCNT_dec_NN(depends_list);
14576         }
14577         else {
14578             cp_list = depends_list;
14579         }
14580     }
14581
14582     /* If there is a swash and more than one element, we can't use the swash in
14583      * the optimization below. */
14584     if (swash && element_count > 1) {
14585         SvREFCNT_dec_NN(swash);
14586         swash = NULL;
14587     }
14588
14589     set_ANYOF_arg(pRExC_state, ret, cp_list,
14590                   (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14591                    ? listsv : NULL,
14592                   swash, has_user_defined_property);
14593
14594     *flagp |= HASWIDTH|SIMPLE;
14595     return ret;
14596 }
14597
14598 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14599
14600 STATIC void
14601 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
14602                 regnode* const node,
14603                 SV* const cp_list,
14604                 SV* const runtime_defns,
14605                 SV* const swash,
14606                 const bool has_user_defined_property)
14607 {
14608     /* Sets the arg field of an ANYOF-type node 'node', using information about
14609      * the node passed-in.  If there is nothing outside the node's bitmap, the
14610      * arg is set to ANYOF_NONBITMAP_EMPTY.  Otherwise, it sets the argument to
14611      * the count returned by add_data(), having allocated and stored an array,
14612      * av, that that count references, as follows:
14613      *  av[0] stores the character class description in its textual form.
14614      *        This is used later (regexec.c:Perl_regclass_swash()) to
14615      *        initialize the appropriate swash, and is also useful for dumping
14616      *        the regnode.  This is set to &PL_sv_undef if the textual
14617      *        description is not needed at run-time (as happens if the other
14618      *        elements completely define the class)
14619      *  av[1] if &PL_sv_undef, is a placeholder to later contain the swash
14620      *        computed from av[0].  But if no further computation need be done,
14621      *        the swash is stored here now (and av[0] is &PL_sv_undef).
14622      *  av[2] stores the cp_list inversion list for use in addition or instead
14623      *        of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
14624      *        (Otherwise everything needed is already in av[0] and av[1])
14625      *  av[3] is set if any component of the class is from a user-defined
14626      *        property; used only if av[2] exists */
14627
14628     UV n;
14629
14630     PERL_ARGS_ASSERT_SET_ANYOF_ARG;
14631
14632     if (! cp_list && ! runtime_defns) {
14633         ARG_SET(node, ANYOF_NONBITMAP_EMPTY);
14634     }
14635     else {
14636         AV * const av = newAV();
14637         SV *rv;
14638
14639         av_store(av, 0, (runtime_defns)
14640                         ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
14641         if (swash) {
14642             av_store(av, 1, swash);
14643             SvREFCNT_dec_NN(cp_list);
14644         }
14645         else {
14646             av_store(av, 1, &PL_sv_undef);
14647             if (cp_list) {
14648                 av_store(av, 2, cp_list);
14649                 av_store(av, 3, newSVuv(has_user_defined_property));
14650             }
14651         }
14652
14653         rv = newRV_noinc(MUTABLE_SV(av));
14654         n = add_data(pRExC_state, STR_WITH_LEN("s"));
14655         RExC_rxi->data->data[n] = (void*)rv;
14656         ARG_SET(node, n);
14657     }
14658 }
14659
14660
14661 /* reg_skipcomment()
14662
14663    Absorbs an /x style # comments from the input stream.
14664    Returns true if there is more text remaining in the stream.
14665    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
14666    terminates the pattern without including a newline.
14667
14668    Note its the callers responsibility to ensure that we are
14669    actually in /x mode
14670
14671 */
14672
14673 STATIC bool
14674 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
14675 {
14676     bool ended = 0;
14677
14678     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
14679
14680     while (RExC_parse < RExC_end)
14681         if (*RExC_parse++ == '\n') {
14682             ended = 1;
14683             break;
14684         }
14685     if (!ended) {
14686         /* we ran off the end of the pattern without ending
14687            the comment, so we have to add an \n when wrapping */
14688         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
14689         return 0;
14690     } else
14691         return 1;
14692 }
14693
14694 /* nextchar()
14695
14696    Advances the parse position, and optionally absorbs
14697    "whitespace" from the inputstream.
14698
14699    Without /x "whitespace" means (?#...) style comments only,
14700    with /x this means (?#...) and # comments and whitespace proper.
14701
14702    Returns the RExC_parse point from BEFORE the scan occurs.
14703
14704    This is the /x friendly way of saying RExC_parse++.
14705 */
14706
14707 STATIC char*
14708 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
14709 {
14710     char* const retval = RExC_parse++;
14711
14712     PERL_ARGS_ASSERT_NEXTCHAR;
14713
14714     for (;;) {
14715         if (RExC_end - RExC_parse >= 3
14716             && *RExC_parse == '('
14717             && RExC_parse[1] == '?'
14718             && RExC_parse[2] == '#')
14719         {
14720             while (*RExC_parse != ')') {
14721                 if (RExC_parse == RExC_end)
14722                     FAIL("Sequence (?#... not terminated");
14723                 RExC_parse++;
14724             }
14725             RExC_parse++;
14726             continue;
14727         }
14728         if (RExC_flags & RXf_PMf_EXTENDED) {
14729             if (isSPACE(*RExC_parse)) {
14730                 RExC_parse++;
14731                 continue;
14732             }
14733             else if (*RExC_parse == '#') {
14734                 if ( reg_skipcomment( pRExC_state ) )
14735                     continue;
14736             }
14737         }
14738         return retval;
14739     }
14740 }
14741
14742 /*
14743 - reg_node - emit a node
14744 */
14745 STATIC regnode *                        /* Location. */
14746 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
14747 {
14748     dVAR;
14749     regnode *ptr;
14750     regnode * const ret = RExC_emit;
14751     GET_RE_DEBUG_FLAGS_DECL;
14752
14753     PERL_ARGS_ASSERT_REG_NODE;
14754
14755     if (SIZE_ONLY) {
14756         SIZE_ALIGN(RExC_size);
14757         RExC_size += 1;
14758         return(ret);
14759     }
14760     if (RExC_emit >= RExC_emit_bound)
14761         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
14762                    op, RExC_emit, RExC_emit_bound);
14763
14764     NODE_ALIGN_FILL(ret);
14765     ptr = ret;
14766     FILL_ADVANCE_NODE(ptr, op);
14767 #ifdef RE_TRACK_PATTERN_OFFSETS
14768     if (RExC_offsets) {         /* MJD */
14769         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
14770               "reg_node", __LINE__,
14771               PL_reg_name[op],
14772               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
14773                 ? "Overwriting end of array!\n" : "OK",
14774               (UV)(RExC_emit - RExC_emit_start),
14775               (UV)(RExC_parse - RExC_start),
14776               (UV)RExC_offsets[0]));
14777         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
14778     }
14779 #endif
14780     RExC_emit = ptr;
14781     return(ret);
14782 }
14783
14784 /*
14785 - reganode - emit a node with an argument
14786 */
14787 STATIC regnode *                        /* Location. */
14788 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
14789 {
14790     dVAR;
14791     regnode *ptr;
14792     regnode * const ret = RExC_emit;
14793     GET_RE_DEBUG_FLAGS_DECL;
14794
14795     PERL_ARGS_ASSERT_REGANODE;
14796
14797     if (SIZE_ONLY) {
14798         SIZE_ALIGN(RExC_size);
14799         RExC_size += 2;
14800         /*
14801            We can't do this:
14802
14803            assert(2==regarglen[op]+1);
14804
14805            Anything larger than this has to allocate the extra amount.
14806            If we changed this to be:
14807
14808            RExC_size += (1 + regarglen[op]);
14809
14810            then it wouldn't matter. Its not clear what side effect
14811            might come from that so its not done so far.
14812            -- dmq
14813         */
14814         return(ret);
14815     }
14816     if (RExC_emit >= RExC_emit_bound)
14817         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
14818                    op, RExC_emit, RExC_emit_bound);
14819
14820     NODE_ALIGN_FILL(ret);
14821     ptr = ret;
14822     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
14823 #ifdef RE_TRACK_PATTERN_OFFSETS
14824     if (RExC_offsets) {         /* MJD */
14825         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14826               "reganode",
14827               __LINE__,
14828               PL_reg_name[op],
14829               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
14830               "Overwriting end of array!\n" : "OK",
14831               (UV)(RExC_emit - RExC_emit_start),
14832               (UV)(RExC_parse - RExC_start),
14833               (UV)RExC_offsets[0]));
14834         Set_Cur_Node_Offset;
14835     }
14836 #endif
14837     RExC_emit = ptr;
14838     return(ret);
14839 }
14840
14841 /*
14842 - reguni - emit (if appropriate) a Unicode character
14843 */
14844 STATIC STRLEN
14845 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
14846 {
14847     dVAR;
14848
14849     PERL_ARGS_ASSERT_REGUNI;
14850
14851     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
14852 }
14853
14854 /*
14855 - reginsert - insert an operator in front of already-emitted operand
14856 *
14857 * Means relocating the operand.
14858 */
14859 STATIC void
14860 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
14861 {
14862     dVAR;
14863     regnode *src;
14864     regnode *dst;
14865     regnode *place;
14866     const int offset = regarglen[(U8)op];
14867     const int size = NODE_STEP_REGNODE + offset;
14868     GET_RE_DEBUG_FLAGS_DECL;
14869
14870     PERL_ARGS_ASSERT_REGINSERT;
14871     PERL_UNUSED_ARG(depth);
14872 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
14873     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
14874     if (SIZE_ONLY) {
14875         RExC_size += size;
14876         return;
14877     }
14878
14879     src = RExC_emit;
14880     RExC_emit += size;
14881     dst = RExC_emit;
14882     if (RExC_open_parens) {
14883         int paren;
14884         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
14885         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
14886             if ( RExC_open_parens[paren] >= opnd ) {
14887                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
14888                 RExC_open_parens[paren] += size;
14889             } else {
14890                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
14891             }
14892             if ( RExC_close_parens[paren] >= opnd ) {
14893                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
14894                 RExC_close_parens[paren] += size;
14895             } else {
14896                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
14897             }
14898         }
14899     }
14900
14901     while (src > opnd) {
14902         StructCopy(--src, --dst, regnode);
14903 #ifdef RE_TRACK_PATTERN_OFFSETS
14904         if (RExC_offsets) {     /* MJD 20010112 */
14905             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
14906                   "reg_insert",
14907                   __LINE__,
14908                   PL_reg_name[op],
14909                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
14910                     ? "Overwriting end of array!\n" : "OK",
14911                   (UV)(src - RExC_emit_start),
14912                   (UV)(dst - RExC_emit_start),
14913                   (UV)RExC_offsets[0]));
14914             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
14915             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
14916         }
14917 #endif
14918     }
14919
14920
14921     place = opnd;               /* Op node, where operand used to be. */
14922 #ifdef RE_TRACK_PATTERN_OFFSETS
14923     if (RExC_offsets) {         /* MJD */
14924         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14925               "reginsert",
14926               __LINE__,
14927               PL_reg_name[op],
14928               (UV)(place - RExC_emit_start) > RExC_offsets[0]
14929               ? "Overwriting end of array!\n" : "OK",
14930               (UV)(place - RExC_emit_start),
14931               (UV)(RExC_parse - RExC_start),
14932               (UV)RExC_offsets[0]));
14933         Set_Node_Offset(place, RExC_parse);
14934         Set_Node_Length(place, 1);
14935     }
14936 #endif
14937     src = NEXTOPER(place);
14938     FILL_ADVANCE_NODE(place, op);
14939     Zero(src, offset, regnode);
14940 }
14941
14942 /*
14943 - regtail - set the next-pointer at the end of a node chain of p to val.
14944 - SEE ALSO: regtail_study
14945 */
14946 /* TODO: All three parms should be const */
14947 STATIC void
14948 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14949 {
14950     dVAR;
14951     regnode *scan;
14952     GET_RE_DEBUG_FLAGS_DECL;
14953
14954     PERL_ARGS_ASSERT_REGTAIL;
14955 #ifndef DEBUGGING
14956     PERL_UNUSED_ARG(depth);
14957 #endif
14958
14959     if (SIZE_ONLY)
14960         return;
14961
14962     /* Find last node. */
14963     scan = p;
14964     for (;;) {
14965         regnode * const temp = regnext(scan);
14966         DEBUG_PARSE_r({
14967             SV * const mysv=sv_newmortal();
14968             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
14969             regprop(RExC_rx, mysv, scan);
14970             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
14971                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
14972                     (temp == NULL ? "->" : ""),
14973                     (temp == NULL ? PL_reg_name[OP(val)] : "")
14974             );
14975         });
14976         if (temp == NULL)
14977             break;
14978         scan = temp;
14979     }
14980
14981     if (reg_off_by_arg[OP(scan)]) {
14982         ARG_SET(scan, val - scan);
14983     }
14984     else {
14985         NEXT_OFF(scan) = val - scan;
14986     }
14987 }
14988
14989 #ifdef DEBUGGING
14990 /*
14991 - regtail_study - set the next-pointer at the end of a node chain of p to val.
14992 - Look for optimizable sequences at the same time.
14993 - currently only looks for EXACT chains.
14994
14995 This is experimental code. The idea is to use this routine to perform
14996 in place optimizations on branches and groups as they are constructed,
14997 with the long term intention of removing optimization from study_chunk so
14998 that it is purely analytical.
14999
15000 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
15001 to control which is which.
15002
15003 */
15004 /* TODO: All four parms should be const */
15005
15006 STATIC U8
15007 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
15008 {
15009     dVAR;
15010     regnode *scan;
15011     U8 exact = PSEUDO;
15012 #ifdef EXPERIMENTAL_INPLACESCAN
15013     I32 min = 0;
15014 #endif
15015     GET_RE_DEBUG_FLAGS_DECL;
15016
15017     PERL_ARGS_ASSERT_REGTAIL_STUDY;
15018
15019
15020     if (SIZE_ONLY)
15021         return exact;
15022
15023     /* Find last node. */
15024
15025     scan = p;
15026     for (;;) {
15027         regnode * const temp = regnext(scan);
15028 #ifdef EXPERIMENTAL_INPLACESCAN
15029         if (PL_regkind[OP(scan)] == EXACT) {
15030             bool has_exactf_sharp_s;    /* Unexamined in this routine */
15031             if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
15032                 return EXACT;
15033         }
15034 #endif
15035         if ( exact ) {
15036             switch (OP(scan)) {
15037                 case EXACT:
15038                 case EXACTF:
15039                 case EXACTFA_NO_TRIE:
15040                 case EXACTFA:
15041                 case EXACTFU:
15042                 case EXACTFU_SS:
15043                 case EXACTFL:
15044                         if( exact == PSEUDO )
15045                             exact= OP(scan);
15046                         else if ( exact != OP(scan) )
15047                             exact= 0;
15048                 case NOTHING:
15049                     break;
15050                 default:
15051                     exact= 0;
15052             }
15053         }
15054         DEBUG_PARSE_r({
15055             SV * const mysv=sv_newmortal();
15056             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
15057             regprop(RExC_rx, mysv, scan);
15058             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
15059                 SvPV_nolen_const(mysv),
15060                 REG_NODE_NUM(scan),
15061                 PL_reg_name[exact]);
15062         });
15063         if (temp == NULL)
15064             break;
15065         scan = temp;
15066     }
15067     DEBUG_PARSE_r({
15068         SV * const mysv_val=sv_newmortal();
15069         DEBUG_PARSE_MSG("");
15070         regprop(RExC_rx, mysv_val, val);
15071         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
15072                       SvPV_nolen_const(mysv_val),
15073                       (IV)REG_NODE_NUM(val),
15074                       (IV)(val - scan)
15075         );
15076     });
15077     if (reg_off_by_arg[OP(scan)]) {
15078         ARG_SET(scan, val - scan);
15079     }
15080     else {
15081         NEXT_OFF(scan) = val - scan;
15082     }
15083
15084     return exact;
15085 }
15086 #endif
15087
15088 /*
15089  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
15090  */
15091 #ifdef DEBUGGING
15092
15093 static void
15094 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
15095 {
15096     int bit;
15097     int set=0;
15098
15099     for (bit=0; bit<32; bit++) {
15100         if (flags & (1<<bit)) {
15101             if (!set++ && lead)
15102                 PerlIO_printf(Perl_debug_log, "%s",lead);
15103             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
15104         }
15105     }
15106     if (lead)  {
15107         if (set)
15108             PerlIO_printf(Perl_debug_log, "\n");
15109         else
15110             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15111     }
15112 }
15113
15114 static void
15115 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
15116 {
15117     int bit;
15118     int set=0;
15119     regex_charset cs;
15120
15121     for (bit=0; bit<32; bit++) {
15122         if (flags & (1<<bit)) {
15123             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
15124                 continue;
15125             }
15126             if (!set++ && lead)
15127                 PerlIO_printf(Perl_debug_log, "%s",lead);
15128             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
15129         }
15130     }
15131     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
15132             if (!set++ && lead) {
15133                 PerlIO_printf(Perl_debug_log, "%s",lead);
15134             }
15135             switch (cs) {
15136                 case REGEX_UNICODE_CHARSET:
15137                     PerlIO_printf(Perl_debug_log, "UNICODE");
15138                     break;
15139                 case REGEX_LOCALE_CHARSET:
15140                     PerlIO_printf(Perl_debug_log, "LOCALE");
15141                     break;
15142                 case REGEX_ASCII_RESTRICTED_CHARSET:
15143                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
15144                     break;
15145                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
15146                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
15147                     break;
15148                 default:
15149                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
15150                     break;
15151             }
15152     }
15153     if (lead)  {
15154         if (set)
15155             PerlIO_printf(Perl_debug_log, "\n");
15156         else
15157             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15158     }
15159 }
15160 #endif
15161
15162 void
15163 Perl_regdump(pTHX_ const regexp *r)
15164 {
15165 #ifdef DEBUGGING
15166     dVAR;
15167     SV * const sv = sv_newmortal();
15168     SV *dsv= sv_newmortal();
15169     RXi_GET_DECL(r,ri);
15170     GET_RE_DEBUG_FLAGS_DECL;
15171
15172     PERL_ARGS_ASSERT_REGDUMP;
15173
15174     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
15175
15176     /* Header fields of interest. */
15177     if (r->anchored_substr) {
15178         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
15179             RE_SV_DUMPLEN(r->anchored_substr), 30);
15180         PerlIO_printf(Perl_debug_log,
15181                       "anchored %s%s at %"IVdf" ",
15182                       s, RE_SV_TAIL(r->anchored_substr),
15183                       (IV)r->anchored_offset);
15184     } else if (r->anchored_utf8) {
15185         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
15186             RE_SV_DUMPLEN(r->anchored_utf8), 30);
15187         PerlIO_printf(Perl_debug_log,
15188                       "anchored utf8 %s%s at %"IVdf" ",
15189                       s, RE_SV_TAIL(r->anchored_utf8),
15190                       (IV)r->anchored_offset);
15191     }
15192     if (r->float_substr) {
15193         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
15194             RE_SV_DUMPLEN(r->float_substr), 30);
15195         PerlIO_printf(Perl_debug_log,
15196                       "floating %s%s at %"IVdf"..%"UVuf" ",
15197                       s, RE_SV_TAIL(r->float_substr),
15198                       (IV)r->float_min_offset, (UV)r->float_max_offset);
15199     } else if (r->float_utf8) {
15200         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
15201             RE_SV_DUMPLEN(r->float_utf8), 30);
15202         PerlIO_printf(Perl_debug_log,
15203                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
15204                       s, RE_SV_TAIL(r->float_utf8),
15205                       (IV)r->float_min_offset, (UV)r->float_max_offset);
15206     }
15207     if (r->check_substr || r->check_utf8)
15208         PerlIO_printf(Perl_debug_log,
15209                       (const char *)
15210                       (r->check_substr == r->float_substr
15211                        && r->check_utf8 == r->float_utf8
15212                        ? "(checking floating" : "(checking anchored"));
15213     if (r->extflags & RXf_NOSCAN)
15214         PerlIO_printf(Perl_debug_log, " noscan");
15215     if (r->extflags & RXf_CHECK_ALL)
15216         PerlIO_printf(Perl_debug_log, " isall");
15217     if (r->check_substr || r->check_utf8)
15218         PerlIO_printf(Perl_debug_log, ") ");
15219
15220     if (ri->regstclass) {
15221         regprop(r, sv, ri->regstclass);
15222         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
15223     }
15224     if (r->extflags & RXf_ANCH) {
15225         PerlIO_printf(Perl_debug_log, "anchored");
15226         if (r->extflags & RXf_ANCH_BOL)
15227             PerlIO_printf(Perl_debug_log, "(BOL)");
15228         if (r->extflags & RXf_ANCH_MBOL)
15229             PerlIO_printf(Perl_debug_log, "(MBOL)");
15230         if (r->extflags & RXf_ANCH_SBOL)
15231             PerlIO_printf(Perl_debug_log, "(SBOL)");
15232         if (r->extflags & RXf_ANCH_GPOS)
15233             PerlIO_printf(Perl_debug_log, "(GPOS)");
15234         PerlIO_putc(Perl_debug_log, ' ');
15235     }
15236     if (r->extflags & RXf_GPOS_SEEN)
15237         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
15238     if (r->intflags & PREGf_SKIP)
15239         PerlIO_printf(Perl_debug_log, "plus ");
15240     if (r->intflags & PREGf_IMPLICIT)
15241         PerlIO_printf(Perl_debug_log, "implicit ");
15242     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
15243     if (r->extflags & RXf_EVAL_SEEN)
15244         PerlIO_printf(Perl_debug_log, "with eval ");
15245     PerlIO_printf(Perl_debug_log, "\n");
15246     DEBUG_FLAGS_r({
15247         regdump_extflags("r->extflags: ",r->extflags);
15248         regdump_intflags("r->intflags: ",r->intflags);
15249     });
15250 #else
15251     PERL_ARGS_ASSERT_REGDUMP;
15252     PERL_UNUSED_CONTEXT;
15253     PERL_UNUSED_ARG(r);
15254 #endif  /* DEBUGGING */
15255 }
15256
15257 /*
15258 - regprop - printable representation of opcode
15259 */
15260
15261 void
15262 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
15263 {
15264 #ifdef DEBUGGING
15265     dVAR;
15266     int k;
15267
15268     /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
15269     static const char * const anyofs[] = {
15270 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
15271     || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
15272     || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
15273     || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
15274     || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
15275     || _CC_VERTSPACE != 16
15276   #error Need to adjust order of anyofs[]
15277 #endif
15278         "\\w",
15279         "\\W",
15280         "\\d",
15281         "\\D",
15282         "[:alpha:]",
15283         "[:^alpha:]",
15284         "[:lower:]",
15285         "[:^lower:]",
15286         "[:upper:]",
15287         "[:^upper:]",
15288         "[:punct:]",
15289         "[:^punct:]",
15290         "[:print:]",
15291         "[:^print:]",
15292         "[:alnum:]",
15293         "[:^alnum:]",
15294         "[:graph:]",
15295         "[:^graph:]",
15296         "[:cased:]",
15297         "[:^cased:]",
15298         "\\s",
15299         "\\S",
15300         "[:blank:]",
15301         "[:^blank:]",
15302         "[:xdigit:]",
15303         "[:^xdigit:]",
15304         "[:space:]",
15305         "[:^space:]",
15306         "[:cntrl:]",
15307         "[:^cntrl:]",
15308         "[:ascii:]",
15309         "[:^ascii:]",
15310         "\\v",
15311         "\\V"
15312     };
15313     RXi_GET_DECL(prog,progi);
15314     GET_RE_DEBUG_FLAGS_DECL;
15315
15316     PERL_ARGS_ASSERT_REGPROP;
15317
15318     sv_setpvs(sv, "");
15319
15320     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
15321         /* It would be nice to FAIL() here, but this may be called from
15322            regexec.c, and it would be hard to supply pRExC_state. */
15323         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
15324     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
15325
15326     k = PL_regkind[OP(o)];
15327
15328     if (k == EXACT) {
15329         sv_catpvs(sv, " ");
15330         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
15331          * is a crude hack but it may be the best for now since
15332          * we have no flag "this EXACTish node was UTF-8"
15333          * --jhi */
15334         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
15335                   PERL_PV_ESCAPE_UNI_DETECT |
15336                   PERL_PV_ESCAPE_NONASCII   |
15337                   PERL_PV_PRETTY_ELLIPSES   |
15338                   PERL_PV_PRETTY_LTGT       |
15339                   PERL_PV_PRETTY_NOCLEAR
15340                   );
15341     } else if (k == TRIE) {
15342         /* print the details of the trie in dumpuntil instead, as
15343          * progi->data isn't available here */
15344         const char op = OP(o);
15345         const U32 n = ARG(o);
15346         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
15347                (reg_ac_data *)progi->data->data[n] :
15348                NULL;
15349         const reg_trie_data * const trie
15350             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
15351
15352         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
15353         DEBUG_TRIE_COMPILE_r(
15354             Perl_sv_catpvf(aTHX_ sv,
15355                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
15356                 (UV)trie->startstate,
15357                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
15358                 (UV)trie->wordcount,
15359                 (UV)trie->minlen,
15360                 (UV)trie->maxlen,
15361                 (UV)TRIE_CHARCOUNT(trie),
15362                 (UV)trie->uniquecharcount
15363             )
15364         );
15365         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
15366             sv_catpvs(sv, "[");
15367             (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
15368                                                    ? ANYOF_BITMAP(o)
15369                                                    : TRIE_BITMAP(trie));
15370             sv_catpvs(sv, "]");
15371         }
15372
15373     } else if (k == CURLY) {
15374         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
15375             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
15376         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
15377     }
15378     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
15379         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
15380     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
15381         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
15382         if ( RXp_PAREN_NAMES(prog) ) {
15383             if ( k != REF || (OP(o) < NREF)) {
15384                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
15385                 SV **name= av_fetch(list, ARG(o), 0 );
15386                 if (name)
15387                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15388             }
15389             else {
15390                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
15391                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
15392                 I32 *nums=(I32*)SvPVX(sv_dat);
15393                 SV **name= av_fetch(list, nums[0], 0 );
15394                 I32 n;
15395                 if (name) {
15396                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
15397                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
15398                                     (n ? "," : ""), (IV)nums[n]);
15399                     }
15400                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15401                 }
15402             }
15403         }
15404     } else if (k == GOSUB)
15405         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
15406     else if (k == VERB) {
15407         if (!o->flags)
15408             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
15409                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
15410     } else if (k == LOGICAL)
15411         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
15412     else if (k == ANYOF) {
15413         const U8 flags = ANYOF_FLAGS(o);
15414         int do_sep = 0;
15415
15416
15417         if (flags & ANYOF_LOCALE)
15418             sv_catpvs(sv, "{loc}");
15419         if (flags & ANYOF_LOC_FOLD)
15420             sv_catpvs(sv, "{i}");
15421         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
15422         if (flags & ANYOF_INVERT)
15423             sv_catpvs(sv, "^");
15424
15425         /* output what the standard cp 0-255 bitmap matches */
15426         do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
15427
15428         /* output any special charclass tests (used entirely under use
15429          * locale) * */
15430         if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
15431             int i;
15432             for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
15433                 if (ANYOF_POSIXL_TEST(o,i)) {
15434                     sv_catpv(sv, anyofs[i]);
15435                     do_sep = 1;
15436                 }
15437             }
15438         }
15439
15440         if (flags & (ANYOF_ABOVE_LATIN1_ALL|ANYOF_ABOVE_LATIN1_ALL)
15441             || ANYOF_NONBITMAP(o))
15442         {
15443             if (do_sep) {
15444                 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
15445                 if (flags & ANYOF_INVERT)
15446                     /*make sure the invert info is in each */
15447                     sv_catpvs(sv, "^");
15448             }
15449
15450         if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
15451             sv_catpvs(sv, "{non-utf8-latin1-all}");
15452         }
15453
15454         /* output information about the unicode matching */
15455         if (flags & ANYOF_ABOVE_LATIN1_ALL)
15456             sv_catpvs(sv, "{unicode_all}");
15457         else if (ANYOF_NONBITMAP(o)) {
15458             SV *lv; /* Set if there is something outside the bit map. */
15459             bool byte_output = FALSE;   /* If something in the bitmap has been
15460                                            output */
15461
15462             if (flags & ANYOF_NONBITMAP_NON_UTF8) {
15463                 sv_catpvs(sv, "{outside bitmap}");
15464             }
15465             else {
15466                 sv_catpvs(sv, "{utf8}");
15467             }
15468
15469             /* Get the stuff that wasn't in the bitmap */
15470             (void) regclass_swash(prog, o, FALSE, &lv, NULL);
15471             if (lv && lv != &PL_sv_undef) {
15472                 char *s = savesvpv(lv);
15473                 char * const origs = s;
15474
15475                 while (*s && *s != '\n')
15476                     s++;
15477
15478                 if (*s == '\n') {
15479                     const char * const t = ++s;
15480
15481                     if (byte_output) {
15482                         sv_catpvs(sv, " ");
15483                     }
15484
15485                     while (*s) {
15486                         if (*s == '\n') {
15487
15488                             /* Truncate very long output */
15489                             if (s - origs > 256) {
15490                                 Perl_sv_catpvf(aTHX_ sv,
15491                                                "%.*s...",
15492                                                (int) (s - origs - 1),
15493                                                t);
15494                                 goto out_dump;
15495                             }
15496                             *s = ' ';
15497                         }
15498                         else if (*s == '\t') {
15499                             *s = '-';
15500                         }
15501                         s++;
15502                     }
15503                     if (s[-1] == ' ')
15504                         s[-1] = 0;
15505
15506                     sv_catpv(sv, t);
15507                 }
15508
15509             out_dump:
15510
15511                 Safefree(origs);
15512                 SvREFCNT_dec_NN(lv);
15513             }
15514         }
15515         }
15516
15517         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
15518     }
15519     else if (k == POSIXD || k == NPOSIXD) {
15520         U8 index = FLAGS(o) * 2;
15521         if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
15522             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
15523         }
15524         else {
15525             if (*anyofs[index] != '[')  {
15526                 sv_catpv(sv, "[");
15527             }
15528             sv_catpv(sv, anyofs[index]);
15529             if (*anyofs[index] != '[')  {
15530                 sv_catpv(sv, "]");
15531             }
15532         }
15533     }
15534     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
15535         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
15536 #else
15537     PERL_UNUSED_CONTEXT;
15538     PERL_UNUSED_ARG(sv);
15539     PERL_UNUSED_ARG(o);
15540     PERL_UNUSED_ARG(prog);
15541 #endif  /* DEBUGGING */
15542 }
15543
15544 SV *
15545 Perl_re_intuit_string(pTHX_ REGEXP * const r)
15546 {                               /* Assume that RE_INTUIT is set */
15547     dVAR;
15548     struct regexp *const prog = ReANY(r);
15549     GET_RE_DEBUG_FLAGS_DECL;
15550
15551     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
15552     PERL_UNUSED_CONTEXT;
15553
15554     DEBUG_COMPILE_r(
15555         {
15556             const char * const s = SvPV_nolen_const(prog->check_substr
15557                       ? prog->check_substr : prog->check_utf8);
15558
15559             if (!PL_colorset) reginitcolors();
15560             PerlIO_printf(Perl_debug_log,
15561                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
15562                       PL_colors[4],
15563                       prog->check_substr ? "" : "utf8 ",
15564                       PL_colors[5],PL_colors[0],
15565                       s,
15566                       PL_colors[1],
15567                       (strlen(s) > 60 ? "..." : ""));
15568         } );
15569
15570     return prog->check_substr ? prog->check_substr : prog->check_utf8;
15571 }
15572
15573 /*
15574    pregfree()
15575
15576    handles refcounting and freeing the perl core regexp structure. When
15577    it is necessary to actually free the structure the first thing it
15578    does is call the 'free' method of the regexp_engine associated to
15579    the regexp, allowing the handling of the void *pprivate; member
15580    first. (This routine is not overridable by extensions, which is why
15581    the extensions free is called first.)
15582
15583    See regdupe and regdupe_internal if you change anything here.
15584 */
15585 #ifndef PERL_IN_XSUB_RE
15586 void
15587 Perl_pregfree(pTHX_ REGEXP *r)
15588 {
15589     SvREFCNT_dec(r);
15590 }
15591
15592 void
15593 Perl_pregfree2(pTHX_ REGEXP *rx)
15594 {
15595     dVAR;
15596     struct regexp *const r = ReANY(rx);
15597     GET_RE_DEBUG_FLAGS_DECL;
15598
15599     PERL_ARGS_ASSERT_PREGFREE2;
15600
15601     if (r->mother_re) {
15602         ReREFCNT_dec(r->mother_re);
15603     } else {
15604         CALLREGFREE_PVT(rx); /* free the private data */
15605         SvREFCNT_dec(RXp_PAREN_NAMES(r));
15606         Safefree(r->xpv_len_u.xpvlenu_pv);
15607     }
15608     if (r->substrs) {
15609         SvREFCNT_dec(r->anchored_substr);
15610         SvREFCNT_dec(r->anchored_utf8);
15611         SvREFCNT_dec(r->float_substr);
15612         SvREFCNT_dec(r->float_utf8);
15613         Safefree(r->substrs);
15614     }
15615     RX_MATCH_COPY_FREE(rx);
15616 #ifdef PERL_ANY_COW
15617     SvREFCNT_dec(r->saved_copy);
15618 #endif
15619     Safefree(r->offs);
15620     SvREFCNT_dec(r->qr_anoncv);
15621     rx->sv_u.svu_rx = 0;
15622 }
15623
15624 /*  reg_temp_copy()
15625
15626     This is a hacky workaround to the structural issue of match results
15627     being stored in the regexp structure which is in turn stored in
15628     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
15629     could be PL_curpm in multiple contexts, and could require multiple
15630     result sets being associated with the pattern simultaneously, such
15631     as when doing a recursive match with (??{$qr})
15632
15633     The solution is to make a lightweight copy of the regexp structure
15634     when a qr// is returned from the code executed by (??{$qr}) this
15635     lightweight copy doesn't actually own any of its data except for
15636     the starp/end and the actual regexp structure itself.
15637
15638 */
15639
15640
15641 REGEXP *
15642 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
15643 {
15644     struct regexp *ret;
15645     struct regexp *const r = ReANY(rx);
15646     const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
15647
15648     PERL_ARGS_ASSERT_REG_TEMP_COPY;
15649
15650     if (!ret_x)
15651         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
15652     else {
15653         SvOK_off((SV *)ret_x);
15654         if (islv) {
15655             /* For PVLVs, SvANY points to the xpvlv body while sv_u points
15656                to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
15657                made both spots point to the same regexp body.) */
15658             REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
15659             assert(!SvPVX(ret_x));
15660             ret_x->sv_u.svu_rx = temp->sv_any;
15661             temp->sv_any = NULL;
15662             SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
15663             SvREFCNT_dec_NN(temp);
15664             /* SvCUR still resides in the xpvlv struct, so the regexp copy-
15665                ing below will not set it. */
15666             SvCUR_set(ret_x, SvCUR(rx));
15667         }
15668     }
15669     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
15670        sv_force_normal(sv) is called.  */
15671     SvFAKE_on(ret_x);
15672     ret = ReANY(ret_x);
15673
15674     SvFLAGS(ret_x) |= SvUTF8(rx);
15675     /* We share the same string buffer as the original regexp, on which we
15676        hold a reference count, incremented when mother_re is set below.
15677        The string pointer is copied here, being part of the regexp struct.
15678      */
15679     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
15680            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
15681     if (r->offs) {
15682         const I32 npar = r->nparens+1;
15683         Newx(ret->offs, npar, regexp_paren_pair);
15684         Copy(r->offs, ret->offs, npar, regexp_paren_pair);
15685     }
15686     if (r->substrs) {
15687         Newx(ret->substrs, 1, struct reg_substr_data);
15688         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15689
15690         SvREFCNT_inc_void(ret->anchored_substr);
15691         SvREFCNT_inc_void(ret->anchored_utf8);
15692         SvREFCNT_inc_void(ret->float_substr);
15693         SvREFCNT_inc_void(ret->float_utf8);
15694
15695         /* check_substr and check_utf8, if non-NULL, point to either their
15696            anchored or float namesakes, and don't hold a second reference.  */
15697     }
15698     RX_MATCH_COPIED_off(ret_x);
15699 #ifdef PERL_ANY_COW
15700     ret->saved_copy = NULL;
15701 #endif
15702     ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
15703     SvREFCNT_inc_void(ret->qr_anoncv);
15704
15705     return ret_x;
15706 }
15707 #endif
15708
15709 /* regfree_internal()
15710
15711    Free the private data in a regexp. This is overloadable by
15712    extensions. Perl takes care of the regexp structure in pregfree(),
15713    this covers the *pprivate pointer which technically perl doesn't
15714    know about, however of course we have to handle the
15715    regexp_internal structure when no extension is in use.
15716
15717    Note this is called before freeing anything in the regexp
15718    structure.
15719  */
15720
15721 void
15722 Perl_regfree_internal(pTHX_ REGEXP * const rx)
15723 {
15724     dVAR;
15725     struct regexp *const r = ReANY(rx);
15726     RXi_GET_DECL(r,ri);
15727     GET_RE_DEBUG_FLAGS_DECL;
15728
15729     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
15730
15731     DEBUG_COMPILE_r({
15732         if (!PL_colorset)
15733             reginitcolors();
15734         {
15735             SV *dsv= sv_newmortal();
15736             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
15737                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
15738             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
15739                 PL_colors[4],PL_colors[5],s);
15740         }
15741     });
15742 #ifdef RE_TRACK_PATTERN_OFFSETS
15743     if (ri->u.offsets)
15744         Safefree(ri->u.offsets);             /* 20010421 MJD */
15745 #endif
15746     if (ri->code_blocks) {
15747         int n;
15748         for (n = 0; n < ri->num_code_blocks; n++)
15749             SvREFCNT_dec(ri->code_blocks[n].src_regex);
15750         Safefree(ri->code_blocks);
15751     }
15752
15753     if (ri->data) {
15754         int n = ri->data->count;
15755
15756         while (--n >= 0) {
15757           /* If you add a ->what type here, update the comment in regcomp.h */
15758             switch (ri->data->what[n]) {
15759             case 'a':
15760             case 'r':
15761             case 's':
15762             case 'S':
15763             case 'u':
15764                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
15765                 break;
15766             case 'f':
15767                 Safefree(ri->data->data[n]);
15768                 break;
15769             case 'l':
15770             case 'L':
15771                 break;
15772             case 'T':
15773                 { /* Aho Corasick add-on structure for a trie node.
15774                      Used in stclass optimization only */
15775                     U32 refcount;
15776                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
15777                     OP_REFCNT_LOCK;
15778                     refcount = --aho->refcount;
15779                     OP_REFCNT_UNLOCK;
15780                     if ( !refcount ) {
15781                         PerlMemShared_free(aho->states);
15782                         PerlMemShared_free(aho->fail);
15783                          /* do this last!!!! */
15784                         PerlMemShared_free(ri->data->data[n]);
15785                         PerlMemShared_free(ri->regstclass);
15786                     }
15787                 }
15788                 break;
15789             case 't':
15790                 {
15791                     /* trie structure. */
15792                     U32 refcount;
15793                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
15794                     OP_REFCNT_LOCK;
15795                     refcount = --trie->refcount;
15796                     OP_REFCNT_UNLOCK;
15797                     if ( !refcount ) {
15798                         PerlMemShared_free(trie->charmap);
15799                         PerlMemShared_free(trie->states);
15800                         PerlMemShared_free(trie->trans);
15801                         if (trie->bitmap)
15802                             PerlMemShared_free(trie->bitmap);
15803                         if (trie->jump)
15804                             PerlMemShared_free(trie->jump);
15805                         PerlMemShared_free(trie->wordinfo);
15806                         /* do this last!!!! */
15807                         PerlMemShared_free(ri->data->data[n]);
15808                     }
15809                 }
15810                 break;
15811             default:
15812                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
15813             }
15814         }
15815         Safefree(ri->data->what);
15816         Safefree(ri->data);
15817     }
15818
15819     Safefree(ri);
15820 }
15821
15822 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
15823 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
15824 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
15825
15826 /*
15827    re_dup - duplicate a regexp.
15828
15829    This routine is expected to clone a given regexp structure. It is only
15830    compiled under USE_ITHREADS.
15831
15832    After all of the core data stored in struct regexp is duplicated
15833    the regexp_engine.dupe method is used to copy any private data
15834    stored in the *pprivate pointer. This allows extensions to handle
15835    any duplication it needs to do.
15836
15837    See pregfree() and regfree_internal() if you change anything here.
15838 */
15839 #if defined(USE_ITHREADS)
15840 #ifndef PERL_IN_XSUB_RE
15841 void
15842 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
15843 {
15844     dVAR;
15845     I32 npar;
15846     const struct regexp *r = ReANY(sstr);
15847     struct regexp *ret = ReANY(dstr);
15848
15849     PERL_ARGS_ASSERT_RE_DUP_GUTS;
15850
15851     npar = r->nparens+1;
15852     Newx(ret->offs, npar, regexp_paren_pair);
15853     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
15854
15855     if (ret->substrs) {
15856         /* Do it this way to avoid reading from *r after the StructCopy().
15857            That way, if any of the sv_dup_inc()s dislodge *r from the L1
15858            cache, it doesn't matter.  */
15859         const bool anchored = r->check_substr
15860             ? r->check_substr == r->anchored_substr
15861             : r->check_utf8 == r->anchored_utf8;
15862         Newx(ret->substrs, 1, struct reg_substr_data);
15863         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15864
15865         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
15866         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
15867         ret->float_substr = sv_dup_inc(ret->float_substr, param);
15868         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
15869
15870         /* check_substr and check_utf8, if non-NULL, point to either their
15871            anchored or float namesakes, and don't hold a second reference.  */
15872
15873         if (ret->check_substr) {
15874             if (anchored) {
15875                 assert(r->check_utf8 == r->anchored_utf8);
15876                 ret->check_substr = ret->anchored_substr;
15877                 ret->check_utf8 = ret->anchored_utf8;
15878             } else {
15879                 assert(r->check_substr == r->float_substr);
15880                 assert(r->check_utf8 == r->float_utf8);
15881                 ret->check_substr = ret->float_substr;
15882                 ret->check_utf8 = ret->float_utf8;
15883             }
15884         } else if (ret->check_utf8) {
15885             if (anchored) {
15886                 ret->check_utf8 = ret->anchored_utf8;
15887             } else {
15888                 ret->check_utf8 = ret->float_utf8;
15889             }
15890         }
15891     }
15892
15893     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
15894     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
15895
15896     if (ret->pprivate)
15897         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
15898
15899     if (RX_MATCH_COPIED(dstr))
15900         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
15901     else
15902         ret->subbeg = NULL;
15903 #ifdef PERL_ANY_COW
15904     ret->saved_copy = NULL;
15905 #endif
15906
15907     /* Whether mother_re be set or no, we need to copy the string.  We
15908        cannot refrain from copying it when the storage points directly to
15909        our mother regexp, because that's
15910                1: a buffer in a different thread
15911                2: something we no longer hold a reference on
15912                so we need to copy it locally.  */
15913     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
15914     ret->mother_re   = NULL;
15915 }
15916 #endif /* PERL_IN_XSUB_RE */
15917
15918 /*
15919    regdupe_internal()
15920
15921    This is the internal complement to regdupe() which is used to copy
15922    the structure pointed to by the *pprivate pointer in the regexp.
15923    This is the core version of the extension overridable cloning hook.
15924    The regexp structure being duplicated will be copied by perl prior
15925    to this and will be provided as the regexp *r argument, however
15926    with the /old/ structures pprivate pointer value. Thus this routine
15927    may override any copying normally done by perl.
15928
15929    It returns a pointer to the new regexp_internal structure.
15930 */
15931
15932 void *
15933 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
15934 {
15935     dVAR;
15936     struct regexp *const r = ReANY(rx);
15937     regexp_internal *reti;
15938     int len;
15939     RXi_GET_DECL(r,ri);
15940
15941     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
15942
15943     len = ProgLen(ri);
15944
15945     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
15946     Copy(ri->program, reti->program, len+1, regnode);
15947
15948     reti->num_code_blocks = ri->num_code_blocks;
15949     if (ri->code_blocks) {
15950         int n;
15951         Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
15952                 struct reg_code_block);
15953         Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
15954                 struct reg_code_block);
15955         for (n = 0; n < ri->num_code_blocks; n++)
15956              reti->code_blocks[n].src_regex = (REGEXP*)
15957                     sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
15958     }
15959     else
15960         reti->code_blocks = NULL;
15961
15962     reti->regstclass = NULL;
15963
15964     if (ri->data) {
15965         struct reg_data *d;
15966         const int count = ri->data->count;
15967         int i;
15968
15969         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
15970                 char, struct reg_data);
15971         Newx(d->what, count, U8);
15972
15973         d->count = count;
15974         for (i = 0; i < count; i++) {
15975             d->what[i] = ri->data->what[i];
15976             switch (d->what[i]) {
15977                 /* see also regcomp.h and regfree_internal() */
15978             case 'a': /* actually an AV, but the dup function is identical.  */
15979             case 'r':
15980             case 's':
15981             case 'S':
15982             case 'u': /* actually an HV, but the dup function is identical.  */
15983                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
15984                 break;
15985             case 'f':
15986                 /* This is cheating. */
15987                 Newx(d->data[i], 1, regnode_ssc);
15988                 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
15989                 reti->regstclass = (regnode*)d->data[i];
15990                 break;
15991             case 'T':
15992                 /* Trie stclasses are readonly and can thus be shared
15993                  * without duplication. We free the stclass in pregfree
15994                  * when the corresponding reg_ac_data struct is freed.
15995                  */
15996                 reti->regstclass= ri->regstclass;
15997                 /* Fall through */
15998             case 't':
15999                 OP_REFCNT_LOCK;
16000                 ((reg_trie_data*)ri->data->data[i])->refcount++;
16001                 OP_REFCNT_UNLOCK;
16002                 /* Fall through */
16003             case 'l':
16004             case 'L':
16005                 d->data[i] = ri->data->data[i];
16006                 break;
16007             default:
16008                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
16009             }
16010         }
16011
16012         reti->data = d;
16013     }
16014     else
16015         reti->data = NULL;
16016
16017     reti->name_list_idx = ri->name_list_idx;
16018
16019 #ifdef RE_TRACK_PATTERN_OFFSETS
16020     if (ri->u.offsets) {
16021         Newx(reti->u.offsets, 2*len+1, U32);
16022         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
16023     }
16024 #else
16025     SetProgLen(reti,len);
16026 #endif
16027
16028     return (void*)reti;
16029 }
16030
16031 #endif    /* USE_ITHREADS */
16032
16033 #ifndef PERL_IN_XSUB_RE
16034
16035 /*
16036  - regnext - dig the "next" pointer out of a node
16037  */
16038 regnode *
16039 Perl_regnext(pTHX_ regnode *p)
16040 {
16041     dVAR;
16042     I32 offset;
16043
16044     if (!p)
16045         return(NULL);
16046
16047     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
16048         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
16049     }
16050
16051     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
16052     if (offset == 0)
16053         return(NULL);
16054
16055     return(p+offset);
16056 }
16057 #endif
16058
16059 STATIC void
16060 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
16061 {
16062     va_list args;
16063     STRLEN l1 = strlen(pat1);
16064     STRLEN l2 = strlen(pat2);
16065     char buf[512];
16066     SV *msv;
16067     const char *message;
16068
16069     PERL_ARGS_ASSERT_RE_CROAK2;
16070
16071     if (l1 > 510)
16072         l1 = 510;
16073     if (l1 + l2 > 510)
16074         l2 = 510 - l1;
16075     Copy(pat1, buf, l1 , char);
16076     Copy(pat2, buf + l1, l2 , char);
16077     buf[l1 + l2] = '\n';
16078     buf[l1 + l2 + 1] = '\0';
16079     va_start(args, pat2);
16080     msv = vmess(buf, &args);
16081     va_end(args);
16082     message = SvPV_const(msv,l1);
16083     if (l1 > 512)
16084         l1 = 512;
16085     Copy(message, buf, l1 , char);
16086     /* l1-1 to avoid \n */
16087     Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
16088 }
16089
16090 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
16091
16092 #ifndef PERL_IN_XSUB_RE
16093 void
16094 Perl_save_re_context(pTHX)
16095 {
16096     dVAR;
16097
16098     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
16099     if (PL_curpm) {
16100         const REGEXP * const rx = PM_GETRE(PL_curpm);
16101         if (rx) {
16102             U32 i;
16103             for (i = 1; i <= RX_NPARENS(rx); i++) {
16104                 char digits[TYPE_CHARS(long)];
16105                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
16106                 GV *const *const gvp
16107                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
16108
16109                 if (gvp) {
16110                     GV * const gv = *gvp;
16111                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
16112                         save_scalar(gv);
16113                 }
16114             }
16115         }
16116     }
16117 }
16118 #endif
16119
16120 #ifdef DEBUGGING
16121
16122 STATIC void
16123 S_put_byte(pTHX_ SV *sv, int c)
16124 {
16125     PERL_ARGS_ASSERT_PUT_BYTE;
16126
16127     /* Our definition of isPRINT() ignores locales, so only bytes that are
16128        not part of UTF-8 are considered printable. I assume that the same
16129        holds for UTF-EBCDIC.
16130        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
16131        which Wikipedia says:
16132
16133        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
16134        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
16135        identical, to the ASCII delete (DEL) or rubout control character. ...
16136        it is typically mapped to hexadecimal code 9F, in order to provide a
16137        unique character mapping in both directions)
16138
16139        So the old condition can be simplified to !isPRINT(c)  */
16140     if (!isPRINT(c)) {
16141         switch (c) {
16142             case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
16143             case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
16144             case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
16145             case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
16146             case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
16147
16148             default:
16149                 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
16150                 break;
16151         }
16152     }
16153     else {
16154         const char string = c;
16155         if (c == '-' || c == ']' || c == '\\' || c == '^')
16156             sv_catpvs(sv, "\\");
16157         sv_catpvn(sv, &string, 1);
16158     }
16159 }
16160
16161 STATIC bool
16162 S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
16163 {
16164     /* Appends to 'sv' a displayable version of the innards of the bracketed
16165      * character class whose bitmap is 'bitmap';  Returns 'TRUE' if it actually
16166      * output anything */
16167
16168     int i;
16169     int rangestart = -1;
16170     bool has_output_anything = FALSE;
16171
16172     PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
16173
16174     for (i = 0; i <= 256; i++) {
16175         if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) {
16176             if (rangestart == -1)
16177                 rangestart = i;
16178         } else if (rangestart != -1) {
16179             int j = i - 1;
16180             if (i <= rangestart + 3) {  /* Individual chars in short ranges */
16181                 for (; rangestart < i; rangestart++)
16182                     put_byte(sv, rangestart);
16183             }
16184             else if (   j > 255
16185                      || ! isALPHANUMERIC(rangestart)
16186                      || ! isALPHANUMERIC(j)
16187                      || isDIGIT(rangestart) != isDIGIT(j)
16188                      || isUPPER(rangestart) != isUPPER(j)
16189                      || isLOWER(rangestart) != isLOWER(j)
16190
16191                         /* This final test should get optimized out except
16192                          * on EBCDIC platforms, where it causes ranges that
16193                          * cross discontinuities like i/j to be shown as hex
16194                          * instead of the misleading, e.g. H-K (since that
16195                          * range includes more than H, I, J, K). */
16196                      || (j - rangestart)
16197                          != NATIVE_TO_ASCII(j) - NATIVE_TO_ASCII(rangestart))
16198             {
16199                 Perl_sv_catpvf(aTHX_ sv, "\\x{%02x}-\\x{%02x}",
16200                                rangestart,
16201                                (j < 256) ? j : 255);
16202             }
16203             else { /* Here, the ends of the range are both digits, or both
16204                       uppercase, or both lowercase; and there's no
16205                       discontinuity in the range (which could happen on EBCDIC
16206                       platforms) */
16207                 put_byte(sv, rangestart);
16208                 sv_catpvs(sv, "-");
16209                 put_byte(sv, j);
16210             }
16211             rangestart = -1;
16212             has_output_anything = TRUE;
16213         }
16214     }
16215
16216     return has_output_anything;
16217 }
16218
16219 #define CLEAR_OPTSTART \
16220     if (optstart) STMT_START { \
16221             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
16222             optstart=NULL; \
16223     } STMT_END
16224
16225 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
16226
16227 STATIC const regnode *
16228 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
16229             const regnode *last, const regnode *plast,
16230             SV* sv, I32 indent, U32 depth)
16231 {
16232     dVAR;
16233     U8 op = PSEUDO;     /* Arbitrary non-END op. */
16234     const regnode *next;
16235     const regnode *optstart= NULL;
16236
16237     RXi_GET_DECL(r,ri);
16238     GET_RE_DEBUG_FLAGS_DECL;
16239
16240     PERL_ARGS_ASSERT_DUMPUNTIL;
16241
16242 #ifdef DEBUG_DUMPUNTIL
16243     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
16244         last ? last-start : 0,plast ? plast-start : 0);
16245 #endif
16246
16247     if (plast && plast < last)
16248         last= plast;
16249
16250     while (PL_regkind[op] != END && (!last || node < last)) {
16251         /* While that wasn't END last time... */
16252         NODE_ALIGN(node);
16253         op = OP(node);
16254         if (op == CLOSE || op == WHILEM)
16255             indent--;
16256         next = regnext((regnode *)node);
16257
16258         /* Where, what. */
16259         if (OP(node) == OPTIMIZED) {
16260             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
16261                 optstart = node;
16262             else
16263                 goto after_print;
16264         } else
16265             CLEAR_OPTSTART;
16266
16267         regprop(r, sv, node);
16268         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
16269                       (int)(2*indent + 1), "", SvPVX_const(sv));
16270
16271         if (OP(node) != OPTIMIZED) {
16272             if (next == NULL)           /* Next ptr. */
16273                 PerlIO_printf(Perl_debug_log, " (0)");
16274             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
16275                 PerlIO_printf(Perl_debug_log, " (FAIL)");
16276             else
16277                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
16278             (void)PerlIO_putc(Perl_debug_log, '\n');
16279         }
16280
16281       after_print:
16282         if (PL_regkind[(U8)op] == BRANCHJ) {
16283             assert(next);
16284             {
16285                 const regnode *nnode = (OP(next) == LONGJMP
16286                                        ? regnext((regnode *)next)
16287                                        : next);
16288                 if (last && nnode > last)
16289                     nnode = last;
16290                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
16291             }
16292         }
16293         else if (PL_regkind[(U8)op] == BRANCH) {
16294             assert(next);
16295             DUMPUNTIL(NEXTOPER(node), next);
16296         }
16297         else if ( PL_regkind[(U8)op]  == TRIE ) {
16298             const regnode *this_trie = node;
16299             const char op = OP(node);
16300             const U32 n = ARG(node);
16301             const reg_ac_data * const ac = op>=AHOCORASICK ?
16302                (reg_ac_data *)ri->data->data[n] :
16303                NULL;
16304             const reg_trie_data * const trie =
16305                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
16306 #ifdef DEBUGGING
16307             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
16308 #endif
16309             const regnode *nextbranch= NULL;
16310             I32 word_idx;
16311             sv_setpvs(sv, "");
16312             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
16313                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
16314
16315                 PerlIO_printf(Perl_debug_log, "%*s%s ",
16316                    (int)(2*(indent+3)), "",
16317                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
16318                             PL_colors[0], PL_colors[1],
16319                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
16320                             PERL_PV_PRETTY_ELLIPSES    |
16321                             PERL_PV_PRETTY_LTGT
16322                             )
16323                             : "???"
16324                 );
16325                 if (trie->jump) {
16326                     U16 dist= trie->jump[word_idx+1];
16327                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
16328                                   (UV)((dist ? this_trie + dist : next) - start));
16329                     if (dist) {
16330                         if (!nextbranch)
16331                             nextbranch= this_trie + trie->jump[0];
16332                         DUMPUNTIL(this_trie + dist, nextbranch);
16333                     }
16334                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
16335                         nextbranch= regnext((regnode *)nextbranch);
16336                 } else {
16337                     PerlIO_printf(Perl_debug_log, "\n");
16338                 }
16339             }
16340             if (last && next > last)
16341                 node= last;
16342             else
16343                 node= next;
16344         }
16345         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
16346             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
16347                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
16348         }
16349         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
16350             assert(next);
16351             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
16352         }
16353         else if ( op == PLUS || op == STAR) {
16354             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
16355         }
16356         else if (PL_regkind[(U8)op] == ANYOF) {
16357             /* arglen 1 + class block */
16358             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL)
16359                     ? ANYOF_POSIXL_SKIP : ANYOF_SKIP);
16360             node = NEXTOPER(node);
16361         }
16362         else if (PL_regkind[(U8)op] == EXACT) {
16363             /* Literal string, where present. */
16364             node += NODE_SZ_STR(node) - 1;
16365             node = NEXTOPER(node);
16366         }
16367         else {
16368             node = NEXTOPER(node);
16369             node += regarglen[(U8)op];
16370         }
16371         if (op == CURLYX || op == OPEN)
16372             indent++;
16373     }
16374     CLEAR_OPTSTART;
16375 #ifdef DEBUG_DUMPUNTIL
16376     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
16377 #endif
16378     return node;
16379 }
16380
16381 #endif  /* DEBUGGING */
16382
16383 /*
16384  * Local variables:
16385  * c-indentation-style: bsd
16386  * c-basic-offset: 4
16387  * indent-tabs-mode: nil
16388  * End:
16389  *
16390  * ex: set ts=8 sts=4 sw=4 et:
16391  */