regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 EXTERN_C const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_static.c"
  90 #include "charclass_invlists.h"
  91 #include "inline_invlist.c"
  92 #include "unicode_constants.h"
  93
  94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  95 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  96 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  97
  98 #ifndef STATIC
  99 #define STATIC  static
 100 #endif
 101
 102
 103 struct RExC_state_t {
 104     U32         flags;                  /* RXf_* are we folding, multilining? */
 105     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 106     char        *precomp;               /* uncompiled string. */
 107     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 108     regexp      *rx;                    /* perl core regexp structure */
 109     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 110     char        *start;                 /* Start of input for compile */
 111     char        *end;                   /* End of input for compile */
 112     char        *parse;                 /* Input-scan pointer. */
 113     SSize_t     whilem_seen;            /* number of WHILEM in this expr */
 114     regnode     *emit_start;            /* Start of emitted-code area */
 115     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 116     regnode     *emit;                  /* Code-emit pointer; if = &emit_dummy,
 117                                            implies compiling, so don't emit */
 118     regnode_ssc emit_dummy;             /* placeholder for emit to point to;
 119                                            large enough for the largest
 120                                            non-EXACTish node, so can use it as
 121                                            scratch in pass1 */
 122     I32         naughty;                /* How bad is this pattern? */
 123     I32         sawback;                /* Did we see \1, ...? */
 124     U32         seen;
 125     SSize_t     size;                   /* Code size. */
 126     I32                npar;                        /* Capture buffer count, (OPEN) plus one. ("par" 0 is the whole pattern)*/
 127     I32         nestroot;               /* root parens we are in - used by accept */
 128     I32         extralen;
 129     I32         seen_zerolen;
 130     regnode     **open_parens;          /* pointers to open parens */
 131     regnode     **close_parens;         /* pointers to close parens */
 132     regnode     *opend;                 /* END node in program */
 133     I32         utf8;           /* whether the pattern is utf8 or not */
 134     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 135                                 /* XXX use this for future optimisation of case
 136                                  * where pattern must be upgraded to utf8. */
 137     I32         uni_semantics;  /* If a d charset modifier should use unicode
 138                                    rules, even if the pattern is not in
 139                                    utf8 */
 140     HV          *paren_names;           /* Paren names */
 141
 142     regnode     **recurse;              /* Recurse regops */
 143     I32         recurse_count;          /* Number of recurse regops */
 144     U8          *study_chunk_recursed;  /* bitmap of which parens we have moved through */
 145     U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
 146     I32         in_lookbehind;
 147     I32         contains_locale;
 148     I32         contains_i;
 149     I32         override_recoding;
 150     I32         in_multi_char_class;
 151     struct reg_code_block *code_blocks; /* positions of literal (?{})
 152                                             within pattern */
 153     int         num_code_blocks;        /* size of code_blocks[] */
 154     int         code_index;             /* next code_blocks[] slot */
 155 #if ADD_TO_REGEXEC
 156     char        *starttry;              /* -Dr: where regtry was called. */
 157 #define RExC_starttry   (pRExC_state->starttry)
 158 #endif
 159     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 160 #ifdef DEBUGGING
 161     const char  *lastparse;
 162     I32         lastnum;
 163     AV          *paren_name_list;       /* idx -> name */
 164 #define RExC_lastparse  (pRExC_state->lastparse)
 165 #define RExC_lastnum    (pRExC_state->lastnum)
 166 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 167 #endif
 168 };
 169
 170 #define RExC_flags      (pRExC_state->flags)
 171 #define RExC_pm_flags   (pRExC_state->pm_flags)
 172 #define RExC_precomp    (pRExC_state->precomp)
 173 #define RExC_rx_sv      (pRExC_state->rx_sv)
 174 #define RExC_rx         (pRExC_state->rx)
 175 #define RExC_rxi        (pRExC_state->rxi)
 176 #define RExC_start      (pRExC_state->start)
 177 #define RExC_end        (pRExC_state->end)
 178 #define RExC_parse      (pRExC_state->parse)
 179 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 180 #ifdef RE_TRACK_PATTERN_OFFSETS
 181 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 182 #endif
 183 #define RExC_emit       (pRExC_state->emit)
 184 #define RExC_emit_dummy (pRExC_state->emit_dummy)
 185 #define RExC_emit_start (pRExC_state->emit_start)
 186 #define RExC_emit_bound (pRExC_state->emit_bound)
 187 #define RExC_naughty    (pRExC_state->naughty)
 188 #define RExC_sawback    (pRExC_state->sawback)
 189 #define RExC_seen       (pRExC_state->seen)
 190 #define RExC_size       (pRExC_state->size)
 191 #define RExC_npar       (pRExC_state->npar)
 192 #define RExC_nestroot   (pRExC_state->nestroot)
 193 #define RExC_extralen   (pRExC_state->extralen)
 194 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 195 #define RExC_utf8       (pRExC_state->utf8)
 196 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 197 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 198 #define RExC_open_parens        (pRExC_state->open_parens)
 199 #define RExC_close_parens       (pRExC_state->close_parens)
 200 #define RExC_opend      (pRExC_state->opend)
 201 #define RExC_paren_names        (pRExC_state->paren_names)
 202 #define RExC_recurse    (pRExC_state->recurse)
 203 #define RExC_recurse_count      (pRExC_state->recurse_count)
 204 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
 205 #define RExC_study_chunk_recursed_bytes        (pRExC_state->study_chunk_recursed_bytes)
 206 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 207 #define RExC_contains_locale    (pRExC_state->contains_locale)
 208 #define RExC_contains_i (pRExC_state->contains_i)
 209 #define RExC_override_recoding (pRExC_state->override_recoding)
 210 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 211
 212
 213 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 214 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 215         ((*s) == '{' && regcurly(s, FALSE)))
 216
 217 /*
 218  * Flags to be passed up and down.
 219  */
 220 #define WORST           0       /* Worst case. */
 221 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 222
 223 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 224  * character.  (There needs to be a case: in the switch statement in regexec.c
 225  * for any node marked SIMPLE.)  Note that this is not the same thing as
 226  * REGNODE_SIMPLE */
 227 #define SIMPLE          0x02
 228 #define SPSTART         0x04    /* Starts with * or + */
 229 #define POSTPONED       0x08    /* (?1),(?&name), (??{...}) or similar */
 230 #define TRYAGAIN        0x10    /* Weeded out a declaration. */
 231 #define RESTART_UTF8    0x20    /* Restart, need to calcuate sizes as UTF-8 */
 232
 233 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 234
 235 /* whether trie related optimizations are enabled */
 236 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 237 #define TRIE_STUDY_OPT
 238 #define FULL_TRIE_STUDY
 239 #define TRIE_STCLASS
 240 #endif
 241
 242
 243
 244 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 245 #define PBITVAL(paren) (1 << ((paren) & 7))
 246 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 247 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 248 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 249
 250 #define REQUIRE_UTF8    STMT_START {                                       \
 251                                      if (!UTF) {                           \
 252                                          *flagp = RESTART_UTF8;            \
 253                                          return NULL;                      \
 254                                      }                                     \
 255                         } STMT_END
 256
 257 /* This converts the named class defined in regcomp.h to its equivalent class
 258  * number defined in handy.h. */
 259 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 260 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 261
 262 #define _invlist_union_complement_2nd(a, b, output) \
 263                         _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
 264 #define _invlist_intersection_complement_2nd(a, b, output) \
 265                  _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
 266
 267 /* About scan_data_t.
 268
 269   During optimisation we recurse through the regexp program performing
 270   various inplace (keyhole style) optimisations. In addition study_chunk
 271   and scan_commit populate this data structure with information about
 272   what strings MUST appear in the pattern. We look for the longest
 273   string that must appear at a fixed location, and we look for the
 274   longest string that may appear at a floating location. So for instance
 275   in the pattern:
 276
 277     /FOO[xX]A.*B[xX]BAR/
 278
 279   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 280   strings (because they follow a .* construct). study_chunk will identify
 281   both FOO and BAR as being the longest fixed and floating strings respectively.
 282
 283   The strings can be composites, for instance
 284
 285      /(f)(o)(o)/
 286
 287   will result in a composite fixed substring 'foo'.
 288
 289   For each string some basic information is maintained:
 290
 291   - offset or min_offset
 292     This is the position the string must appear at, or not before.
 293     It also implicitly (when combined with minlenp) tells us how many
 294     characters must match before the string we are searching for.
 295     Likewise when combined with minlenp and the length of the string it
 296     tells us how many characters must appear after the string we have
 297     found.
 298
 299   - max_offset
 300     Only used for floating strings. This is the rightmost point that
 301     the string can appear at. If set to SSize_t_MAX it indicates that the
 302     string can occur infinitely far to the right.
 303
 304   - minlenp
 305     A pointer to the minimum number of characters of the pattern that the
 306     string was found inside. This is important as in the case of positive
 307     lookahead or positive lookbehind we can have multiple patterns
 308     involved. Consider
 309
 310     /(?=FOO).*F/
 311
 312     The minimum length of the pattern overall is 3, the minimum length
 313     of the lookahead part is 3, but the minimum length of the part that
 314     will actually match is 1. So 'FOO's minimum length is 3, but the
 315     minimum length for the F is 1. This is important as the minimum length
 316     is used to determine offsets in front of and behind the string being
 317     looked for.  Since strings can be composites this is the length of the
 318     pattern at the time it was committed with a scan_commit. Note that
 319     the length is calculated by study_chunk, so that the minimum lengths
 320     are not known until the full pattern has been compiled, thus the
 321     pointer to the value.
 322
 323   - lookbehind
 324
 325     In the case of lookbehind the string being searched for can be
 326     offset past the start point of the final matching string.
 327     If this value was just blithely removed from the min_offset it would
 328     invalidate some of the calculations for how many chars must match
 329     before or after (as they are derived from min_offset and minlen and
 330     the length of the string being searched for).
 331     When the final pattern is compiled and the data is moved from the
 332     scan_data_t structure into the regexp structure the information
 333     about lookbehind is factored in, with the information that would
 334     have been lost precalculated in the end_shift field for the
 335     associated string.
 336
 337   The fields pos_min and pos_delta are used to store the minimum offset
 338   and the delta to the maximum offset at the current point in the pattern.
 339
 340 */
 341
 342 typedef struct scan_data_t {
 343     /*I32 len_min;      unused */
 344     /*I32 len_delta;    unused */
 345     SSize_t pos_min;
 346     SSize_t pos_delta;
 347     SV *last_found;
 348     SSize_t last_end;       /* min value, <0 unless valid. */
 349     SSize_t last_start_min;
 350     SSize_t last_start_max;
 351     SV **longest;           /* Either &l_fixed, or &l_float. */
 352     SV *longest_fixed;      /* longest fixed string found in pattern */
 353     SSize_t offset_fixed;   /* offset where it starts */
 354     SSize_t *minlen_fixed;  /* pointer to the minlen relevant to the string */
 355     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 356     SV *longest_float;      /* longest floating string found in pattern */
 357     SSize_t offset_float_min; /* earliest point in string it can appear */
 358     SSize_t offset_float_max; /* latest point in string it can appear */
 359     SSize_t *minlen_float;  /* pointer to the minlen relevant to the string */
 360     SSize_t lookbehind_float; /* is the pos of the string modified by LB */
 361     I32 flags;
 362     I32 whilem_c;
 363     SSize_t *last_closep;
 364     regnode_ssc *start_class;
 365 } scan_data_t;
 366
 367 /* The below is perhaps overboard, but this allows us to save a test at the
 368  * expense of a mask.  This is because on both EBCDIC and ASCII machines, 'A'
 369  * and 'a' differ by a single bit; the same with the upper and lower case of
 370  * all other ASCII-range alphabetics.  On ASCII platforms, they are 32 apart;
 371  * on EBCDIC, they are 64.  This uses an exclusive 'or' to find that bit and
 372  * then inverts it to form a mask, with just a single 0, in the bit position
 373  * where the upper- and lowercase differ.  XXX There are about 40 other
 374  * instances in the Perl core where this micro-optimization could be used.
 375  * Should decide if maintenance cost is worse, before changing those
 376  *
 377  * Returns a boolean as to whether or not 'v' is either a lowercase or
 378  * uppercase instance of 'c', where 'c' is in [A-Za-z].  If 'c' is a
 379  * compile-time constant, the generated code is better than some optimizing
 380  * compilers figure out, amounting to a mask and test.  The results are
 381  * meaningless if 'c' is not one of [A-Za-z] */
 382 #define isARG2_lower_or_UPPER_ARG1(c, v) \
 383                               (((v) & ~('A' ^ 'a')) ==  ((c) & ~('A' ^ 'a')))
 384
 385 /*
 386  * Forward declarations for pregcomp()'s friends.
 387  */
 388
 389 static const scan_data_t zero_scan_data =
 390   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 391
 392 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 393 #define SF_BEFORE_SEOL          0x0001
 394 #define SF_BEFORE_MEOL          0x0002
 395 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 396 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 397
 398 #define SF_FIX_SHIFT_EOL        (+2)
 399 #define SF_FL_SHIFT_EOL         (+4)
 400
 401 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 402 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 403
 404 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 405 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 406 #define SF_IS_INF               0x0040
 407 #define SF_HAS_PAR              0x0080
 408 #define SF_IN_PAR               0x0100
 409 #define SF_HAS_EVAL             0x0200
 410 #define SCF_DO_SUBSTR           0x0400
 411 #define SCF_DO_STCLASS_AND      0x0800
 412 #define SCF_DO_STCLASS_OR       0x1000
 413 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 414 #define SCF_WHILEM_VISITED_POS  0x2000
 415
 416 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 417 #define SCF_SEEN_ACCEPT         0x8000
 418 #define SCF_TRIE_DOING_RESTUDY 0x10000
 419
 420 #define UTF cBOOL(RExC_utf8)
 421
 422 /* The enums for all these are ordered so things work out correctly */
 423 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 424 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 425 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 426 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 427 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 428 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
 429 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 430
 431 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 432
 433 /* For programs that want to be strictly Unicode compatible by dying if any
 434  * attempt is made to match a non-Unicode code point against a Unicode
 435  * property.  */
 436 #define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))
 437
 438 #define OOB_NAMEDCLASS          -1
 439
 440 /* There is no code point that is out-of-bounds, so this is problematic.  But
 441  * its only current use is to initialize a variable that is always set before
 442  * looked at. */
 443 #define OOB_UNICODE             0xDEADBEEF
 444
 445 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 446 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 447
 448
 449 /* length of regex to show in messages that don't mark a position within */
 450 #define RegexLengthToShowInErrorMessages 127
 451
 452 /*
 453  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 454  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 455  * op/pragma/warn/regcomp.
 456  */
 457 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 458 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 459
 460 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%"UTF8f MARKER2 "%"UTF8f"/"
 461
 462 #define REPORT_LOCATION_ARGS(offset)            \
 463                 UTF8fARG(UTF, offset, RExC_precomp), \
 464                 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
 465
 466 /*
 467  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 468  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 469  * "...".
 470  */
 471 #define _FAIL(code) STMT_START {                                        \
 472     const char *ellipses = "";                                          \
 473     IV len = RExC_end - RExC_precomp;                                   \
 474                                                                         \
 475     if (!SIZE_ONLY)                                                     \
 476         SAVEFREESV(RExC_rx_sv);                                         \
 477     if (len > RegexLengthToShowInErrorMessages) {                       \
 478         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 479         len = RegexLengthToShowInErrorMessages - 10;                    \
 480         ellipses = "...";                                               \
 481     }                                                                   \
 482     code;                                                               \
 483 } STMT_END
 484
 485 #define FAIL(msg) _FAIL(                            \
 486     Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/",           \
 487             msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 488
 489 #define FAIL2(msg,arg) _FAIL(                       \
 490     Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/",         \
 491             arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 492
 493 /*
 494  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 495  */
 496 #define Simple_vFAIL(m) STMT_START {                                    \
 497     const IV offset = RExC_parse - RExC_precomp;                        \
 498     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 499             m, REPORT_LOCATION_ARGS(offset));   \
 500 } STMT_END
 501
 502 /*
 503  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 504  */
 505 #define vFAIL(m) STMT_START {                           \
 506     if (!SIZE_ONLY)                                     \
 507         SAVEFREESV(RExC_rx_sv);                         \
 508     Simple_vFAIL(m);                                    \
 509 } STMT_END
 510
 511 /*
 512  * Like Simple_vFAIL(), but accepts two arguments.
 513  */
 514 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 515     const IV offset = RExC_parse - RExC_precomp;                        \
 516     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,                      \
 517                       REPORT_LOCATION_ARGS(offset));    \
 518 } STMT_END
 519
 520 /*
 521  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 522  */
 523 #define vFAIL2(m,a1) STMT_START {                       \
 524     if (!SIZE_ONLY)                                     \
 525         SAVEFREESV(RExC_rx_sv);                         \
 526     Simple_vFAIL2(m, a1);                               \
 527 } STMT_END
 528
 529
 530 /*
 531  * Like Simple_vFAIL(), but accepts three arguments.
 532  */
 533 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 534     const IV offset = RExC_parse - RExC_precomp;                \
 535     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,          \
 536             REPORT_LOCATION_ARGS(offset));      \
 537 } STMT_END
 538
 539 /*
 540  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 541  */
 542 #define vFAIL3(m,a1,a2) STMT_START {                    \
 543     if (!SIZE_ONLY)                                     \
 544         SAVEFREESV(RExC_rx_sv);                         \
 545     Simple_vFAIL3(m, a1, a2);                           \
 546 } STMT_END
 547
 548 /*
 549  * Like Simple_vFAIL(), but accepts four arguments.
 550  */
 551 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 552     const IV offset = RExC_parse - RExC_precomp;                \
 553     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3,              \
 554             REPORT_LOCATION_ARGS(offset));      \
 555 } STMT_END
 556
 557 #define vFAIL4(m,a1,a2,a3) STMT_START {                 \
 558     if (!SIZE_ONLY)                                     \
 559         SAVEFREESV(RExC_rx_sv);                         \
 560     Simple_vFAIL4(m, a1, a2, a3);                       \
 561 } STMT_END
 562
 563 /* A specialized version of vFAIL2 that works with UTF8f */
 564 #define vFAIL2utf8f(m, a1) STMT_START { \
 565     const IV offset = RExC_parse - RExC_precomp;   \
 566     if (!SIZE_ONLY)                                \
 567         SAVEFREESV(RExC_rx_sv);                    \
 568     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
 569             REPORT_LOCATION_ARGS(offset));         \
 570 } STMT_END
 571
 572
 573 /* m is not necessarily a "literal string", in this macro */
 574 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
 575     const IV offset = loc - RExC_precomp;                               \
 576     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
 577             m, REPORT_LOCATION_ARGS(offset));       \
 578 } STMT_END
 579
 580 #define ckWARNreg(loc,m) STMT_START {                                   \
 581     const IV offset = loc - RExC_precomp;                               \
 582     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 583             REPORT_LOCATION_ARGS(offset));              \
 584 } STMT_END
 585
 586 #define vWARN_dep(loc, m) STMT_START {                                  \
 587     const IV offset = loc - RExC_precomp;                               \
 588     Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION,     \
 589             REPORT_LOCATION_ARGS(offset));              \
 590 } STMT_END
 591
 592 #define ckWARNdep(loc,m) STMT_START {                                   \
 593     const IV offset = loc - RExC_precomp;                               \
 594     Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),                   \
 595             m REPORT_LOCATION,                                          \
 596             REPORT_LOCATION_ARGS(offset));              \
 597 } STMT_END
 598
 599 #define ckWARNregdep(loc,m) STMT_START {                                \
 600     const IV offset = loc - RExC_precomp;                               \
 601     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 602             m REPORT_LOCATION,                                          \
 603             REPORT_LOCATION_ARGS(offset));              \
 604 } STMT_END
 605
 606 #define ckWARN2reg_d(loc,m, a1) STMT_START {                            \
 607     const IV offset = loc - RExC_precomp;                               \
 608     Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),                       \
 609             m REPORT_LOCATION,                                          \
 610             a1, REPORT_LOCATION_ARGS(offset));  \
 611 } STMT_END
 612
 613 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 614     const IV offset = loc - RExC_precomp;                               \
 615     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 616             a1, REPORT_LOCATION_ARGS(offset));  \
 617 } STMT_END
 618
 619 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 620     const IV offset = loc - RExC_precomp;                               \
 621     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 622             a1, a2, REPORT_LOCATION_ARGS(offset));      \
 623 } STMT_END
 624
 625 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 626     const IV offset = loc - RExC_precomp;                               \
 627     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 628             a1, a2, REPORT_LOCATION_ARGS(offset));      \
 629 } STMT_END
 630
 631 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 632     const IV offset = loc - RExC_precomp;                               \
 633     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 634             a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 635 } STMT_END
 636
 637 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 638     const IV offset = loc - RExC_precomp;                               \
 639     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 640             a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 641 } STMT_END
 642
 643 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 644     const IV offset = loc - RExC_precomp;                               \
 645     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 646             a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
 647 } STMT_END
 648
 649
 650 /* Allow for side effects in s */
 651 #define REGC(c,s) STMT_START {                  \
 652     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 653 } STMT_END
 654
 655 /* Macros for recording node offsets.   20001227 mjd@plover.com
 656  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 657  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 658  * Element 0 holds the number n.
 659  * Position is 1 indexed.
 660  */
 661 #ifndef RE_TRACK_PATTERN_OFFSETS
 662 #define Set_Node_Offset_To_R(node,byte)
 663 #define Set_Node_Offset(node,byte)
 664 #define Set_Cur_Node_Offset
 665 #define Set_Node_Length_To_R(node,len)
 666 #define Set_Node_Length(node,len)
 667 #define Set_Node_Cur_Length(node,start)
 668 #define Node_Offset(n)
 669 #define Node_Length(n)
 670 #define Set_Node_Offset_Length(node,offset,len)
 671 #define ProgLen(ri) ri->u.proglen
 672 #define SetProgLen(ri,x) ri->u.proglen = x
 673 #else
 674 #define ProgLen(ri) ri->u.offsets[0]
 675 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 676 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 677     if (! SIZE_ONLY) {                                                  \
 678         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 679                     __LINE__, (int)(node), (int)(byte)));               \
 680         if((node) < 0) {                                                \
 681             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 682         } else {                                                        \
 683             RExC_offsets[2*(node)-1] = (byte);                          \
 684         }                                                               \
 685     }                                                                   \
 686 } STMT_END
 687
 688 #define Set_Node_Offset(node,byte) \
 689     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 690 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 691
 692 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 693     if (! SIZE_ONLY) {                                                  \
 694         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 695                 __LINE__, (int)(node), (int)(len)));                    \
 696         if((node) < 0) {                                                \
 697             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 698         } else {                                                        \
 699             RExC_offsets[2*(node)] = (len);                             \
 700         }                                                               \
 701     }                                                                   \
 702 } STMT_END
 703
 704 #define Set_Node_Length(node,len) \
 705     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 706 #define Set_Node_Cur_Length(node, start)                \
 707     Set_Node_Length(node, RExC_parse - start)
 708
 709 /* Get offsets and lengths */
 710 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 711 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 712
 713 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 714     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 715     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 716 } STMT_END
 717 #endif
 718
 719 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 720 #define EXPERIMENTAL_INPLACESCAN
 721 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 722
 723 #define DEBUG_RExC_seen() \
 724         DEBUG_OPTIMISE_MORE_r({                                                     \
 725             PerlIO_printf(Perl_debug_log,"RExC_seen: ");                            \
 726                                                                                     \
 727             if (RExC_seen & REG_SEEN_ZERO_LEN)                                      \
 728                 PerlIO_printf(Perl_debug_log,"REG_SEEN_ZERO_LEN ");                 \
 729                                                                                     \
 730             if (RExC_seen & REG_SEEN_LOOKBEHIND)                                    \
 731                 PerlIO_printf(Perl_debug_log,"REG_SEEN_LOOKBEHIND ");               \
 732                                                                                     \
 733             if (RExC_seen & REG_SEEN_GPOS)                                          \
 734                 PerlIO_printf(Perl_debug_log,"REG_SEEN_GPOS ");                     \
 735                                                                                     \
 736             if (RExC_seen & REG_SEEN_CANY)                                            \
 737                 PerlIO_printf(Perl_debug_log,"REG_SEEN_CANY ");                     \
 738                                                                                     \
 739             if (RExC_seen & REG_SEEN_RECURSE)                                       \
 740                 PerlIO_printf(Perl_debug_log,"REG_SEEN_RECURSE ");                  \
 741                                                                                     \
 742             if (RExC_seen & REG_TOP_LEVEL_BRANCHES)                                 \
 743                 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES ");            \
 744                                                                                     \
 745             if (RExC_seen & REG_SEEN_VERBARG)                                       \
 746                 PerlIO_printf(Perl_debug_log,"REG_SEEN_VERBARG ");                  \
 747                                                                                     \
 748             if (RExC_seen & REG_SEEN_CUTGROUP)                                      \
 749                 PerlIO_printf(Perl_debug_log,"REG_SEEN_CUTGROUP ");                 \
 750                                                                                     \
 751             if (RExC_seen & REG_SEEN_RUN_ON_COMMENT)                                \
 752                 PerlIO_printf(Perl_debug_log,"REG_SEEN_RUN_ON_COMMENT ");           \
 753                                                                                     \
 754             if (RExC_seen & REG_SEEN_EXACTF_SHARP_S)                                \
 755                 PerlIO_printf(Perl_debug_log,"REG_SEEN_EXACTF_SHARP_S ");           \
 756                                                                                     \
 757             if (RExC_seen & REG_SEEN_GOSTART)                                       \
 758                 PerlIO_printf(Perl_debug_log,"REG_SEEN_GOSTART ");                  \
 759                                                                                     \
 760             PerlIO_printf(Perl_debug_log,"\n");                                     \
 761         });
 762
 763 #define DEBUG_STUDYDATA(str,data,depth)                              \
 764 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 765     PerlIO_printf(Perl_debug_log,                                    \
 766         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 767         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 768         (int)(depth)*2, "",                                          \
 769         (IV)((data)->pos_min),                                       \
 770         (IV)((data)->pos_delta),                                     \
 771         (UV)((data)->flags),                                         \
 772         (IV)((data)->whilem_c),                                      \
 773         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 774         is_inf ? "INF " : ""                                         \
 775     );                                                               \
 776     if ((data)->last_found)                                          \
 777         PerlIO_printf(Perl_debug_log,                                \
 778             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 779             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 780             SvPVX_const((data)->last_found),                         \
 781             (IV)((data)->last_end),                                  \
 782             (IV)((data)->last_start_min),                            \
 783             (IV)((data)->last_start_max),                            \
 784             ((data)->longest &&                                      \
 785              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 786             SvPVX_const((data)->longest_fixed),                      \
 787             (IV)((data)->offset_fixed),                              \
 788             ((data)->longest &&                                      \
 789              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 790             SvPVX_const((data)->longest_float),                      \
 791             (IV)((data)->offset_float_min),                          \
 792             (IV)((data)->offset_float_max)                           \
 793         );                                                           \
 794     PerlIO_printf(Perl_debug_log,"\n");                              \
 795 });
 796
 797 /* Mark that we cannot extend a found fixed substring at this point.
 798    Update the longest found anchored substring and the longest found
 799    floating substrings if needed. */
 800
 801 STATIC void
 802 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
 803                     SSize_t *minlenp, int is_inf)
 804 {
 805     const STRLEN l = CHR_SVLEN(data->last_found);
 806     const STRLEN old_l = CHR_SVLEN(*data->longest);
 807     GET_RE_DEBUG_FLAGS_DECL;
 808
 809     PERL_ARGS_ASSERT_SCAN_COMMIT;
 810
 811     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 812         SvSetMagicSV(*data->longest, data->last_found);
 813         if (*data->longest == data->longest_fixed) {
 814             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 815             if (data->flags & SF_BEFORE_EOL)
 816                 data->flags
 817                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 818             else
 819                 data->flags &= ~SF_FIX_BEFORE_EOL;
 820             data->minlen_fixed=minlenp;
 821             data->lookbehind_fixed=0;
 822         }
 823         else { /* *data->longest == data->longest_float */
 824             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 825             data->offset_float_max = (l
 826                                       ? data->last_start_max
 827                                       : (data->pos_delta == SSize_t_MAX
 828                                          ? SSize_t_MAX
 829                                          : data->pos_min + data->pos_delta));
 830             if (is_inf
 831                  || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
 832                 data->offset_float_max = SSize_t_MAX;
 833             if (data->flags & SF_BEFORE_EOL)
 834                 data->flags
 835                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 836             else
 837                 data->flags &= ~SF_FL_BEFORE_EOL;
 838             data->minlen_float=minlenp;
 839             data->lookbehind_float=0;
 840         }
 841     }
 842     SvCUR_set(data->last_found, 0);
 843     {
 844         SV * const sv = data->last_found;
 845         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 846             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 847             if (mg)
 848                 mg->mg_len = 0;
 849         }
 850     }
 851     data->last_end = -1;
 852     data->flags &= ~SF_BEFORE_EOL;
 853     DEBUG_STUDYDATA("commit: ",data,0);
 854 }
 855
 856 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
 857  * list that describes which code points it matches */
 858
 859 STATIC void
 860 S_ssc_anything(pTHX_ regnode_ssc *ssc)
 861 {
 862     /* Set the SSC 'ssc' to match an empty string or any code point */
 863
 864     PERL_ARGS_ASSERT_SSC_ANYTHING;
 865
 866     assert(OP(ssc) == ANYOF_SYNTHETIC);
 867
 868     ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
 869     _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
 870     ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING;    /* Plus match empty string */
 871 }
 872
 873 STATIC int
 874 S_ssc_is_anything(pTHX_ const regnode_ssc *ssc)
 875 {
 876     /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
 877      * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
 878      * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
 879      * in any way, so there's no point in using it */
 880
 881     UV start, end;
 882     bool ret;
 883
 884     PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
 885
 886     assert(OP(ssc) == ANYOF_SYNTHETIC);
 887
 888     if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) {
 889         return FALSE;
 890     }
 891
 892     /* See if the list consists solely of the range 0 - Infinity */
 893     invlist_iterinit(ssc->invlist);
 894     ret = invlist_iternext(ssc->invlist, &start, &end)
 895           && start == 0
 896           && end == UV_MAX;
 897
 898     invlist_iterfinish(ssc->invlist);
 899
 900     if (ret) {
 901         return TRUE;
 902     }
 903
 904     /* If e.g., both \w and \W are set, matches everything */
 905     if (ANYOF_FLAGS(ssc) & ANYOF_POSIXL) {
 906         int i;
 907         for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
 908             if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
 909                 return TRUE;
 910             }
 911         }
 912     }
 913
 914     return FALSE;
 915 }
 916
 917 STATIC void
 918 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
 919 {
 920     /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
 921      * string, any code point, or any posix class under locale */
 922
 923     PERL_ARGS_ASSERT_SSC_INIT;
 924
 925     Zero(ssc, 1, regnode_ssc);
 926     OP(ssc) = ANYOF_SYNTHETIC;
 927     ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY);
 928     ssc_anything(ssc);
 929
 930     /* If any portion of the regex is to operate under locale rules,
 931      * initialization includes it.  The reason this isn't done for all regexes
 932      * is that the optimizer was written under the assumption that locale was
 933      * all-or-nothing.  Given the complexity and lack of documentation in the
 934      * optimizer, and that there are inadequate test cases for locale, many
 935      * parts of it may not work properly, it is safest to avoid locale unless
 936      * necessary. */
 937     if (RExC_contains_locale) {
 938         ANYOF_POSIXL_SETALL(ssc);
 939         ANYOF_FLAGS(ssc) |= ANYOF_LOCALE|ANYOF_POSIXL;
 940         if (RExC_contains_i) {
 941             ANYOF_FLAGS(ssc) |= ANYOF_LOC_FOLD;
 942         }
 943     }
 944     else {
 945         ANYOF_POSIXL_ZERO(ssc);
 946     }
 947 }
 948
 949 STATIC int
 950 S_ssc_is_cp_posixl_init(pTHX_ const RExC_state_t *pRExC_state,
 951                               const regnode_ssc *ssc)
 952 {
 953     /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
 954      * to the list of code points matched, and locale posix classes; hence does
 955      * not check its flags) */
 956
 957     UV start, end;
 958     bool ret;
 959
 960     PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
 961
 962     assert(OP(ssc) == ANYOF_SYNTHETIC);
 963
 964     invlist_iterinit(ssc->invlist);
 965     ret = invlist_iternext(ssc->invlist, &start, &end)
 966           && start == 0
 967           && end == UV_MAX;
 968
 969     invlist_iterfinish(ssc->invlist);
 970
 971     if (! ret) {
 972         return FALSE;
 973     }
 974
 975     if (RExC_contains_locale) {
 976         if (! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE)
 977             || ! (ANYOF_FLAGS(ssc) & ANYOF_POSIXL)
 978             || ! ANYOF_POSIXL_TEST_ALL_SET(ssc))
 979         {
 980             return FALSE;
 981         }
 982         if (RExC_contains_i && ! (ANYOF_FLAGS(ssc) & ANYOF_LOC_FOLD)) {
 983             return FALSE;
 984         }
 985     }
 986
 987     return TRUE;
 988 }
 989
 990 STATIC SV*
 991 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
 992                                   const regnode_charclass_posixl* const node)
 993 {
 994     /* Returns a mortal inversion list defining which code points are matched
 995      * by 'node', which is of type ANYOF.  Handles complementing the result if
 996      * appropriate.  If some code points aren't knowable at this time, the
 997      * returned list must, and will, contain every possible code point. */
 998
 999     SV* invlist = sv_2mortal(_new_invlist(0));
1000     unsigned int i;
1001     const U32 n = ARG(node);
1002
1003     PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1004
1005     /* Look at the data structure created by S_set_ANYOF_arg() */
1006     if (n != ANYOF_NONBITMAP_EMPTY) {
1007         SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1008         AV * const av = MUTABLE_AV(SvRV(rv));
1009         SV **const ary = AvARRAY(av);
1010         assert(RExC_rxi->data->what[n] == 's');
1011
1012         if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1013             invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1014         }
1015         else if (ary[0] && ary[0] != &PL_sv_undef) {
1016
1017             /* Here, no compile-time swash, and there are things that won't be
1018              * known until runtime -- we have to assume it could be anything */
1019             return _add_range_to_invlist(invlist, 0, UV_MAX);
1020         }
1021         else {
1022
1023             /* Here no compile-time swash, and no run-time only data.  Use the
1024              * node's inversion list */
1025             invlist = sv_2mortal(invlist_clone(ary[2]));
1026         }
1027     }
1028
1029     /* An ANYOF node contains a bitmap for the first 256 code points, and an
1030      * inversion list for the others, but if there are code points that should
1031      * match only conditionally on the target string being UTF-8, those are
1032      * placed in the inversion list, and not the bitmap.  Since there are
1033      * circumstances under which they could match, they are included in the
1034      * SSC.  But if the ANYOF node is to be inverted, we have to exclude them
1035      * here, so that when we invert below, the end result actually does include
1036      * them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We have to do this here
1037      * before we add the unconditionally matched code points */
1038     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1039         _invlist_intersection_complement_2nd(invlist,
1040                                              PL_UpperLatin1,
1041                                              &invlist);
1042     }
1043
1044     /* Add in the points from the bit map */
1045     for (i = 0; i < 256; i++) {
1046         if (ANYOF_BITMAP_TEST(node, i)) {
1047             invlist = add_cp_to_invlist(invlist, i);
1048         }
1049     }
1050
1051     /* If this can match all upper Latin1 code points, have to add them
1052      * as well */
1053     if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_LATIN1_ALL) {
1054         _invlist_union(invlist, PL_UpperLatin1, &invlist);
1055     }
1056
1057     /* Similarly for these */
1058     if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
1059         invlist = _add_range_to_invlist(invlist, 256, UV_MAX);
1060     }
1061
1062     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1063         _invlist_invert(invlist);
1064     }
1065
1066     return invlist;
1067 }
1068
1069 /* These two functions currently do the exact same thing */
1070 #define ssc_init_zero           ssc_init
1071
1072 #define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
1073 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1074
1075 STATIC void
1076 S_ssc_flags_and(regnode_ssc *ssc, const U8 and_with)
1077 {
1078     /* Take the flags 'and_with' and accumulate them anded into the flags for
1079      * the SSC 'ssc'.  The non-SSC related flags in 'and_with' are ignored.
1080      * The flags 'and_with' should not come from another SSC (otherwise the
1081      * EMPTY_STRING flag won't work) */
1082
1083     const U8 ssc_only_flags = ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS;
1084
1085     PERL_ARGS_ASSERT_SSC_FLAGS_AND;
1086
1087     /* Use just the SSC-related flags from 'and_with' */
1088     ANYOF_FLAGS(ssc) &= (and_with & ANYOF_COMMON_FLAGS);
1089     ANYOF_FLAGS(ssc) |= ssc_only_flags;
1090 }
1091
1092 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
1093  * should not be inverted.  'and_with->flags & ANYOF_POSIXL' should be 0 if
1094  * 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1095
1096 STATIC void
1097 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1098                 const regnode_ssc *and_with)
1099 {
1100     /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1101      * another SSC or a regular ANYOF class.  Can create false positives. */
1102
1103     SV* anded_cp_list;
1104     U8  anded_flags;
1105
1106     PERL_ARGS_ASSERT_SSC_AND;
1107
1108     assert(OP(ssc) == ANYOF_SYNTHETIC);
1109
1110     /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1111      * the code point inversion list and just the relevant flags */
1112     if (OP(and_with) == ANYOF_SYNTHETIC) {
1113         anded_cp_list = and_with->invlist;
1114         anded_flags = ANYOF_FLAGS(and_with);
1115
1116         /* XXX This is a kludge around what appears to be deficiencies in the
1117          * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
1118          * there are paths through the optimizer where it doesn't get weeded
1119          * out when it should.  And if we don't make some extra provision for
1120          * it like the code just below, it doesn't get added when it should.
1121          * This solution is to add it only when AND'ing, which is here, and
1122          * only when what is being AND'ed is the pristine, original node
1123          * matching anything.  Thus it is like adding it to ssc_anything() but
1124          * only when the result is to be AND'ed.  Probably the same solution
1125          * could be adopted for the same problem we have with /l matching,
1126          * which is solved differently in S_ssc_init(), and that would lead to
1127          * fewer false positives than that solution has.  But if this solution
1128          * creates bugs, the consequences are only that a warning isn't raised
1129          * that should be; while the consequences for having /l bugs is
1130          * incorrect matches */
1131         if (ssc_is_anything(and_with)) {
1132             anded_flags |= ANYOF_WARN_SUPER;
1133         }
1134     }
1135     else {
1136         anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state,
1137                                         (regnode_charclass_posixl*) and_with);
1138         anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1139     }
1140
1141     ANYOF_FLAGS(ssc) &= anded_flags;
1142
1143     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1144      * C2 is the list of code points in 'and-with'; P2, its posix classes.
1145      * 'and_with' may be inverted.  When not inverted, we have the situation of
1146      * computing:
1147      *  (C1 | P1) & (C2 | P2)
1148      *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1149      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1150      *                    <=  ((C1 & C2) |       P2)) | ( P1       | (P1 & P2))
1151      *                    <=  ((C1 & C2) | P1 | P2)
1152      * Alternatively, the last few steps could be:
1153      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1154      *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
1155      *                    <=  (C1 | C2 | (P1 & P2))
1156      * We favor the second approach if either P1 or P2 is non-empty.  This is
1157      * because these components are a barrier to doing optimizations, as what
1158      * they match cannot be known until the moment of matching as they are
1159      * dependent on the current locale, 'AND"ing them likely will reduce or
1160      * eliminate them.
1161      * But we can do better if we know that C1,P1 are in their initial state (a
1162      * frequent occurrence), each matching everything:
1163      *  (<everything>) & (C2 | P2) =  C2 | P2
1164      * Similarly, if C2,P2 are in their initial state (again a frequent
1165      * occurrence), the result is a no-op
1166      *  (C1 | P1) & (<everything>) =  C1 | P1
1167      *
1168      * Inverted, we have
1169      *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
1170      *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1171      *                         <=  (C1 & ~C2) | (P1 & ~P2)
1172      * */
1173
1174     if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1175         && OP(and_with) != ANYOF_SYNTHETIC)
1176     {
1177         unsigned int i;
1178
1179         ssc_intersection(ssc,
1180                          anded_cp_list,
1181                          FALSE /* Has already been inverted */
1182                          );
1183
1184         /* If either P1 or P2 is empty, the intersection will be also; can skip
1185          * the loop */
1186         if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) {
1187             ANYOF_POSIXL_ZERO(ssc);
1188         }
1189         else if (ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
1190
1191             /* Note that the Posix class component P from 'and_with' actually
1192              * looks like:
1193              *      P = Pa | Pb | ... | Pn
1194              * where each component is one posix class, such as in [\w\s].
1195              * Thus
1196              *      ~P = ~(Pa | Pb | ... | Pn)
1197              *         = ~Pa & ~Pb & ... & ~Pn
1198              *        <= ~Pa | ~Pb | ... | ~Pn
1199              * The last is something we can easily calculate, but unfortunately
1200              * is likely to have many false positives.  We could do better
1201              * in some (but certainly not all) instances if two classes in
1202              * P have known relationships.  For example
1203              *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1204              * So
1205              *      :lower: & :print: = :lower:
1206              * And similarly for classes that must be disjoint.  For example,
1207              * since \s and \w can have no elements in common based on rules in
1208              * the POSIX standard,
1209              *      \w & ^\S = nothing
1210              * Unfortunately, some vendor locales do not meet the Posix
1211              * standard, in particular almost everything by Microsoft.
1212              * The loop below just changes e.g., \w into \W and vice versa */
1213
1214             regnode_charclass_posixl temp;
1215             int add = 1;    /* To calculate the index of the complement */
1216
1217             ANYOF_POSIXL_ZERO(&temp);
1218             for (i = 0; i < ANYOF_MAX; i++) {
1219                 assert(i % 2 != 0
1220                        || ! ANYOF_POSIXL_TEST(and_with, i)
1221                        || ! ANYOF_POSIXL_TEST(and_with, i + 1));
1222
1223                 if (ANYOF_POSIXL_TEST(and_with, i)) {
1224                     ANYOF_POSIXL_SET(&temp, i + add);
1225                 }
1226                 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1227             }
1228             ANYOF_POSIXL_AND(&temp, ssc);
1229
1230         } /* else ssc already has no posixes */
1231     } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
1232          in its initial state */
1233     else if (OP(and_with) != ANYOF_SYNTHETIC
1234              || ! ssc_is_cp_posixl_init(pRExC_state, and_with))
1235     {
1236         /* But if 'ssc' is in its initial state, the result is just 'and_with';
1237          * copy it over 'ssc' */
1238         if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1239             if (OP(and_with) == ANYOF_SYNTHETIC) {
1240                 StructCopy(and_with, ssc, regnode_ssc);
1241             }
1242             else {
1243                 ssc->invlist = anded_cp_list;
1244                 ANYOF_POSIXL_ZERO(ssc);
1245                 if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1246                     ANYOF_POSIXL_OR(and_with, ssc);
1247                 }
1248             }
1249         }
1250         else if ((ANYOF_FLAGS(ssc) & ANYOF_POSIXL)
1251                     || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL))
1252         {
1253             /* One or the other of P1, P2 is non-empty. */
1254             ANYOF_POSIXL_AND(and_with, ssc);
1255             ssc_union(ssc, anded_cp_list, FALSE);
1256         }
1257         else { /* P1 = P2 = empty */
1258             ssc_intersection(ssc, anded_cp_list, FALSE);
1259         }
1260     }
1261 }
1262
1263 STATIC void
1264 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1265                const regnode_ssc *or_with)
1266 {
1267     /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1268      * another SSC or a regular ANYOF class.  Can create false positives if
1269      * 'or_with' is to be inverted. */
1270
1271     SV* ored_cp_list;
1272     U8 ored_flags;
1273
1274     PERL_ARGS_ASSERT_SSC_OR;
1275
1276     assert(OP(ssc) == ANYOF_SYNTHETIC);
1277
1278     /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1279      * the code point inversion list and just the relevant flags */
1280     if (OP(or_with) == ANYOF_SYNTHETIC) {
1281         ored_cp_list = or_with->invlist;
1282         ored_flags = ANYOF_FLAGS(or_with);
1283     }
1284     else {
1285         ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state,
1286                                         (regnode_charclass_posixl*) or_with);
1287         ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1288     }
1289
1290     ANYOF_FLAGS(ssc) |= ored_flags;
1291
1292     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1293      * C2 is the list of code points in 'or-with'; P2, its posix classes.
1294      * 'or_with' may be inverted.  When not inverted, we have the simple
1295      * situation of computing:
1296      *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
1297      * If P1|P2 yields a situation with both a class and its complement are
1298      * set, like having both \w and \W, this matches all code points, and we
1299      * can delete these from the P component of the ssc going forward.  XXX We
1300      * might be able to delete all the P components, but I (khw) am not certain
1301      * about this, and it is better to be safe.
1302      *
1303      * Inverted, we have
1304      *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
1305      *                         <=  (C1 | P1) | ~C2
1306      *                         <=  (C1 | ~C2) | P1
1307      * (which results in actually simpler code than the non-inverted case)
1308      * */
1309
1310     if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1311         && OP(or_with) != ANYOF_SYNTHETIC)
1312     {
1313         /* We ignore P2, leaving P1 going forward */
1314     }
1315     else {  /* Not inverted */
1316         ANYOF_POSIXL_OR(or_with, ssc);
1317         if (ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
1318             unsigned int i;
1319             for (i = 0; i < ANYOF_MAX; i += 2) {
1320                 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1321                 {
1322                     ssc_match_all_cp(ssc);
1323                     ANYOF_POSIXL_CLEAR(ssc, i);
1324                     ANYOF_POSIXL_CLEAR(ssc, i+1);
1325                     if (! ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
1326                         ANYOF_FLAGS(ssc) &= ~ANYOF_POSIXL;
1327                     }
1328                 }
1329             }
1330         }
1331     }
1332
1333     ssc_union(ssc,
1334               ored_cp_list,
1335               FALSE /* Already has been inverted */
1336               );
1337 }
1338
1339 PERL_STATIC_INLINE void
1340 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1341 {
1342     PERL_ARGS_ASSERT_SSC_UNION;
1343
1344     assert(OP(ssc) == ANYOF_SYNTHETIC);
1345
1346     _invlist_union_maybe_complement_2nd(ssc->invlist,
1347                                         invlist,
1348                                         invert2nd,
1349                                         &ssc->invlist);
1350 }
1351
1352 PERL_STATIC_INLINE void
1353 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1354                          SV* const invlist,
1355                          const bool invert2nd)
1356 {
1357     PERL_ARGS_ASSERT_SSC_INTERSECTION;
1358
1359     assert(OP(ssc) == ANYOF_SYNTHETIC);
1360
1361     _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1362                                                invlist,
1363                                                invert2nd,
1364                                                &ssc->invlist);
1365 }
1366
1367 PERL_STATIC_INLINE void
1368 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1369 {
1370     PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1371
1372     assert(OP(ssc) == ANYOF_SYNTHETIC);
1373
1374     ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1375 }
1376
1377 PERL_STATIC_INLINE void
1378 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1379 {
1380     /* AND just the single code point 'cp' into the SSC 'ssc' */
1381
1382     SV* cp_list = _new_invlist(2);
1383
1384     PERL_ARGS_ASSERT_SSC_CP_AND;
1385
1386     assert(OP(ssc) == ANYOF_SYNTHETIC);
1387
1388     cp_list = add_cp_to_invlist(cp_list, cp);
1389     ssc_intersection(ssc, cp_list,
1390                      FALSE /* Not inverted */
1391                      );
1392     SvREFCNT_dec_NN(cp_list);
1393 }
1394
1395 PERL_STATIC_INLINE void
1396 S_ssc_clear_locale(pTHX_ regnode_ssc *ssc)
1397 {
1398     /* Set the SSC 'ssc' to not match any locale things */
1399
1400     PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1401
1402     assert(OP(ssc) == ANYOF_SYNTHETIC);
1403
1404     ANYOF_POSIXL_ZERO(ssc);
1405     ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1406 }
1407
1408 STATIC void
1409 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1410 {
1411     /* The inversion list in the SSC is marked mortal; now we need a more
1412      * permanent copy, which is stored the same way that is done in a regular
1413      * ANYOF node, with the first 256 code points in a bit map */
1414
1415     SV* invlist = invlist_clone(ssc->invlist);
1416
1417     PERL_ARGS_ASSERT_SSC_FINALIZE;
1418
1419     assert(OP(ssc) == ANYOF_SYNTHETIC);
1420
1421     /* The code in this file assumes that all but these flags aren't relevant
1422      * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the
1423      * time we reach here */
1424     assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
1425
1426     populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1427
1428     set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL, FALSE);
1429
1430     assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE) || RExC_contains_locale);
1431 }
1432
1433 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1434 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1435 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1436 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1437
1438
1439 #ifdef DEBUGGING
1440 /*
1441    dump_trie(trie,widecharmap,revcharmap)
1442    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1443    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1444
1445    These routines dump out a trie in a somewhat readable format.
1446    The _interim_ variants are used for debugging the interim
1447    tables that are used to generate the final compressed
1448    representation which is what dump_trie expects.
1449
1450    Part of the reason for their existence is to provide a form
1451    of documentation as to how the different representations function.
1452
1453 */
1454
1455 /*
1456   Dumps the final compressed table form of the trie to Perl_debug_log.
1457   Used for debugging make_trie().
1458 */
1459
1460 STATIC void
1461 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1462             AV *revcharmap, U32 depth)
1463 {
1464     U32 state;
1465     SV *sv=sv_newmortal();
1466     int colwidth= widecharmap ? 6 : 4;
1467     U16 word;
1468     GET_RE_DEBUG_FLAGS_DECL;
1469
1470     PERL_ARGS_ASSERT_DUMP_TRIE;
1471
1472     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1473         (int)depth * 2 + 2,"",
1474         "Match","Base","Ofs" );
1475
1476     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1477         SV ** const tmp = av_fetch( revcharmap, state, 0);
1478         if ( tmp ) {
1479             PerlIO_printf( Perl_debug_log, "%*s",
1480                 colwidth,
1481                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1482                             PL_colors[0], PL_colors[1],
1483                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1484                             PERL_PV_ESCAPE_FIRSTCHAR
1485                 )
1486             );
1487         }
1488     }
1489     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1490         (int)depth * 2 + 2,"");
1491
1492     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1493         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1494     PerlIO_printf( Perl_debug_log, "\n");
1495
1496     for( state = 1 ; state < trie->statecount ; state++ ) {
1497         const U32 base = trie->states[ state ].trans.base;
1498
1499         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1500
1501         if ( trie->states[ state ].wordnum ) {
1502             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1503         } else {
1504             PerlIO_printf( Perl_debug_log, "%6s", "" );
1505         }
1506
1507         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1508
1509         if ( base ) {
1510             U32 ofs = 0;
1511
1512             while( ( base + ofs  < trie->uniquecharcount ) ||
1513                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1514                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1515                     ofs++;
1516
1517             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1518
1519             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1520                 if ( ( base + ofs >= trie->uniquecharcount ) &&
1521                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1522                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1523                 {
1524                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1525                     colwidth,
1526                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1527                 } else {
1528                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1529                 }
1530             }
1531
1532             PerlIO_printf( Perl_debug_log, "]");
1533
1534         }
1535         PerlIO_printf( Perl_debug_log, "\n" );
1536     }
1537     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1538     for (word=1; word <= trie->wordcount; word++) {
1539         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1540             (int)word, (int)(trie->wordinfo[word].prev),
1541             (int)(trie->wordinfo[word].len));
1542     }
1543     PerlIO_printf(Perl_debug_log, "\n" );
1544 }
1545 /*
1546   Dumps a fully constructed but uncompressed trie in list form.
1547   List tries normally only are used for construction when the number of
1548   possible chars (trie->uniquecharcount) is very high.
1549   Used for debugging make_trie().
1550 */
1551 STATIC void
1552 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1553                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1554                          U32 depth)
1555 {
1556     U32 state;
1557     SV *sv=sv_newmortal();
1558     int colwidth= widecharmap ? 6 : 4;
1559     GET_RE_DEBUG_FLAGS_DECL;
1560
1561     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1562
1563     /* print out the table precompression.  */
1564     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1565         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1566         "------:-----+-----------------\n" );
1567
1568     for( state=1 ; state < next_alloc ; state ++ ) {
1569         U16 charid;
1570
1571         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1572             (int)depth * 2 + 2,"", (UV)state  );
1573         if ( ! trie->states[ state ].wordnum ) {
1574             PerlIO_printf( Perl_debug_log, "%5s| ","");
1575         } else {
1576             PerlIO_printf( Perl_debug_log, "W%4x| ",
1577                 trie->states[ state ].wordnum
1578             );
1579         }
1580         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1581             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1582             if ( tmp ) {
1583                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1584                     colwidth,
1585                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1586                             PL_colors[0], PL_colors[1],
1587                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1588                             PERL_PV_ESCAPE_FIRSTCHAR
1589                     ) ,
1590                     TRIE_LIST_ITEM(state,charid).forid,
1591                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1592                 );
1593                 if (!(charid % 10))
1594                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1595                         (int)((depth * 2) + 14), "");
1596             }
1597         }
1598         PerlIO_printf( Perl_debug_log, "\n");
1599     }
1600 }
1601
1602 /*
1603   Dumps a fully constructed but uncompressed trie in table form.
1604   This is the normal DFA style state transition table, with a few
1605   twists to facilitate compression later.
1606   Used for debugging make_trie().
1607 */
1608 STATIC void
1609 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1610                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1611                           U32 depth)
1612 {
1613     U32 state;
1614     U16 charid;
1615     SV *sv=sv_newmortal();
1616     int colwidth= widecharmap ? 6 : 4;
1617     GET_RE_DEBUG_FLAGS_DECL;
1618
1619     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1620
1621     /*
1622        print out the table precompression so that we can do a visual check
1623        that they are identical.
1624      */
1625
1626     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1627
1628     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1629         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1630         if ( tmp ) {
1631             PerlIO_printf( Perl_debug_log, "%*s",
1632                 colwidth,
1633                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1634                             PL_colors[0], PL_colors[1],
1635                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1636                             PERL_PV_ESCAPE_FIRSTCHAR
1637                 )
1638             );
1639         }
1640     }
1641
1642     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1643
1644     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1645         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1646     }
1647
1648     PerlIO_printf( Perl_debug_log, "\n" );
1649
1650     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1651
1652         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1653             (int)depth * 2 + 2,"",
1654             (UV)TRIE_NODENUM( state ) );
1655
1656         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1657             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1658             if (v)
1659                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1660             else
1661                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1662         }
1663         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1664             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1665         } else {
1666             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1667             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1668         }
1669     }
1670 }
1671
1672 #endif
1673
1674
1675 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1676   startbranch: the first branch in the whole branch sequence
1677   first      : start branch of sequence of branch-exact nodes.
1678                May be the same as startbranch
1679   last       : Thing following the last branch.
1680                May be the same as tail.
1681   tail       : item following the branch sequence
1682   count      : words in the sequence
1683   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1684   depth      : indent depth
1685
1686 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1687
1688 A trie is an N'ary tree where the branches are determined by digital
1689 decomposition of the key. IE, at the root node you look up the 1st character and
1690 follow that branch repeat until you find the end of the branches. Nodes can be
1691 marked as "accepting" meaning they represent a complete word. Eg:
1692
1693   /he|she|his|hers/
1694
1695 would convert into the following structure. Numbers represent states, letters
1696 following numbers represent valid transitions on the letter from that state, if
1697 the number is in square brackets it represents an accepting state, otherwise it
1698 will be in parenthesis.
1699
1700       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1701       |    |
1702       |   (2)
1703       |    |
1704      (1)   +-i->(6)-+-s->[7]
1705       |
1706       +-s->(3)-+-h->(4)-+-e->[5]
1707
1708       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1709
1710 This shows that when matching against the string 'hers' we will begin at state 1
1711 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1712 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1713 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1714 single traverse. We store a mapping from accepting to state to which word was
1715 matched, and then when we have multiple possibilities we try to complete the
1716 rest of the regex in the order in which they occured in the alternation.
1717
1718 The only prior NFA like behaviour that would be changed by the TRIE support is
1719 the silent ignoring of duplicate alternations which are of the form:
1720
1721  / (DUPE|DUPE) X? (?{ ... }) Y /x
1722
1723 Thus EVAL blocks following a trie may be called a different number of times with
1724 and without the optimisation. With the optimisations dupes will be silently
1725 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1726 the following demonstrates:
1727
1728  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1729
1730 which prints out 'word' three times, but
1731
1732  'words'=~/(word|word|word)(?{ print $1 })S/
1733
1734 which doesnt print it out at all. This is due to other optimisations kicking in.
1735
1736 Example of what happens on a structural level:
1737
1738 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1739
1740    1: CURLYM[1] {1,32767}(18)
1741    5:   BRANCH(8)
1742    6:     EXACT <ac>(16)
1743    8:   BRANCH(11)
1744    9:     EXACT <ad>(16)
1745   11:   BRANCH(14)
1746   12:     EXACT <ab>(16)
1747   16:   SUCCEED(0)
1748   17:   NOTHING(18)
1749   18: END(0)
1750
1751 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1752 and should turn into:
1753
1754    1: CURLYM[1] {1,32767}(18)
1755    5:   TRIE(16)
1756         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1757           <ac>
1758           <ad>
1759           <ab>
1760   16:   SUCCEED(0)
1761   17:   NOTHING(18)
1762   18: END(0)
1763
1764 Cases where tail != last would be like /(?foo|bar)baz/:
1765
1766    1: BRANCH(4)
1767    2:   EXACT <foo>(8)
1768    4: BRANCH(7)
1769    5:   EXACT <bar>(8)
1770    7: TAIL(8)
1771    8: EXACT <baz>(10)
1772   10: END(0)
1773
1774 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1775 and would end up looking like:
1776
1777     1: TRIE(8)
1778       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1779         <foo>
1780         <bar>
1781    7: TAIL(8)
1782    8: EXACT <baz>(10)
1783   10: END(0)
1784
1785     d = uvchr_to_utf8_flags(d, uv, 0);
1786
1787 is the recommended Unicode-aware way of saying
1788
1789     *(d++) = uv;
1790 */
1791
1792 #define TRIE_STORE_REVCHAR(val)                                            \
1793     STMT_START {                                                           \
1794         if (UTF) {                                                         \
1795             SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1796             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1797             unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1798             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1799             SvPOK_on(zlopp);                                               \
1800             SvUTF8_on(zlopp);                                              \
1801             av_push(revcharmap, zlopp);                                    \
1802         } else {                                                           \
1803             char ooooff = (char)val;                                           \
1804             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1805         }                                                                  \
1806         } STMT_END
1807
1808 /* This gets the next character from the input, folding it if not already
1809  * folded. */
1810 #define TRIE_READ_CHAR STMT_START {                                           \
1811     wordlen++;                                                                \
1812     if ( UTF ) {                                                              \
1813         /* if it is UTF then it is either already folded, or does not need    \
1814          * folding */                                                         \
1815         uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
1816     }                                                                         \
1817     else if (folder == PL_fold_latin1) {                                      \
1818         /* This folder implies Unicode rules, which in the range expressible  \
1819          *  by not UTF is the lower case, with the two exceptions, one of     \
1820          *  which should have been taken care of before calling this */       \
1821         assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
1822         uvc = toLOWER_L1(*uc);                                                \
1823         if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
1824         len = 1;                                                              \
1825     } else {                                                                  \
1826         /* raw data, will be folded later if needed */                        \
1827         uvc = (U32)*uc;                                                       \
1828         len = 1;                                                              \
1829     }                                                                         \
1830 } STMT_END
1831
1832
1833
1834 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1835     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1836         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1837         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1838     }                                                           \
1839     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1840     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1841     TRIE_LIST_CUR( state )++;                                   \
1842 } STMT_END
1843
1844 #define TRIE_LIST_NEW(state) STMT_START {                       \
1845     Newxz( trie->states[ state ].trans.list,               \
1846         4, reg_trie_trans_le );                                 \
1847      TRIE_LIST_CUR( state ) = 1;                                \
1848      TRIE_LIST_LEN( state ) = 4;                                \
1849 } STMT_END
1850
1851 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1852     U16 dupe= trie->states[ state ].wordnum;                    \
1853     regnode * const noper_next = regnext( noper );              \
1854                                                                 \
1855     DEBUG_r({                                                   \
1856         /* store the word for dumping */                        \
1857         SV* tmp;                                                \
1858         if (OP(noper) != NOTHING)                               \
1859             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1860         else                                                    \
1861             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1862         av_push( trie_words, tmp );                             \
1863     });                                                         \
1864                                                                 \
1865     curword++;                                                  \
1866     trie->wordinfo[curword].prev   = 0;                         \
1867     trie->wordinfo[curword].len    = wordlen;                   \
1868     trie->wordinfo[curword].accept = state;                     \
1869                                                                 \
1870     if ( noper_next < tail ) {                                  \
1871         if (!trie->jump)                                        \
1872             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1873         trie->jump[curword] = (U16)(noper_next - convert);      \
1874         if (!jumper)                                            \
1875             jumper = noper_next;                                \
1876         if (!nextbranch)                                        \
1877             nextbranch= regnext(cur);                           \
1878     }                                                           \
1879                                                                 \
1880     if ( dupe ) {                                               \
1881         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1882         /* chain, so that when the bits of chain are later    */\
1883         /* linked together, the dups appear in the chain      */\
1884         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1885         trie->wordinfo[dupe].prev = curword;                    \
1886     } else {                                                    \
1887         /* we haven't inserted this word yet.                */ \
1888         trie->states[ state ].wordnum = curword;                \
1889     }                                                           \
1890 } STMT_END
1891
1892
1893 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1894      ( ( base + charid >=  ucharcount                                   \
1895          && base + charid < ubound                                      \
1896          && state == trie->trans[ base - ucharcount + charid ].check    \
1897          && trie->trans[ base - ucharcount + charid ].next )            \
1898            ? trie->trans[ base - ucharcount + charid ].next             \
1899            : ( state==1 ? special : 0 )                                 \
1900       )
1901
1902 #define MADE_TRIE       1
1903 #define MADE_JUMP_TRIE  2
1904 #define MADE_EXACT_TRIE 4
1905
1906 STATIC I32
1907 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1908 {
1909     dVAR;
1910     /* first pass, loop through and scan words */
1911     reg_trie_data *trie;
1912     HV *widecharmap = NULL;
1913     AV *revcharmap = newAV();
1914     regnode *cur;
1915     STRLEN len = 0;
1916     UV uvc = 0;
1917     U16 curword = 0;
1918     U32 next_alloc = 0;
1919     regnode *jumper = NULL;
1920     regnode *nextbranch = NULL;
1921     regnode *convert = NULL;
1922     U32 *prev_states; /* temp array mapping each state to previous one */
1923     /* we just use folder as a flag in utf8 */
1924     const U8 * folder = NULL;
1925
1926 #ifdef DEBUGGING
1927     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
1928     AV *trie_words = NULL;
1929     /* along with revcharmap, this only used during construction but both are
1930      * useful during debugging so we store them in the struct when debugging.
1931      */
1932 #else
1933     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
1934     STRLEN trie_charcount=0;
1935 #endif
1936     SV *re_trie_maxbuff;
1937     GET_RE_DEBUG_FLAGS_DECL;
1938
1939     PERL_ARGS_ASSERT_MAKE_TRIE;
1940 #ifndef DEBUGGING
1941     PERL_UNUSED_ARG(depth);
1942 #endif
1943
1944     switch (flags) {
1945         case EXACT: break;
1946         case EXACTFA:
1947         case EXACTFU_SS:
1948         case EXACTFU: folder = PL_fold_latin1; break;
1949         case EXACTF:  folder = PL_fold; break;
1950         case EXACTFL: folder = PL_fold_locale; break;
1951         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1952     }
1953
1954     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1955     trie->refcount = 1;
1956     trie->startstate = 1;
1957     trie->wordcount = word_count;
1958     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1959     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1960     if (flags == EXACT)
1961         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1962     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1963                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1964
1965     DEBUG_r({
1966         trie_words = newAV();
1967     });
1968
1969     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1970     if (!SvIOK(re_trie_maxbuff)) {
1971         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1972     }
1973     DEBUG_TRIE_COMPILE_r({
1974                 PerlIO_printf( Perl_debug_log,
1975                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1976                   (int)depth * 2 + 2, "",
1977                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1978                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1979                   (int)depth);
1980     });
1981
1982    /* Find the node we are going to overwrite */
1983     if ( first == startbranch && OP( last ) != BRANCH ) {
1984         /* whole branch chain */
1985         convert = first;
1986     } else {
1987         /* branch sub-chain */
1988         convert = NEXTOPER( first );
1989     }
1990
1991     /*  -- First loop and Setup --
1992
1993        We first traverse the branches and scan each word to determine if it
1994        contains widechars, and how many unique chars there are, this is
1995        important as we have to build a table with at least as many columns as we
1996        have unique chars.
1997
1998        We use an array of integers to represent the character codes 0..255
1999        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
2000        native representation of the character value as the key and IV's for the
2001        coded index.
2002
2003        *TODO* If we keep track of how many times each character is used we can
2004        remap the columns so that the table compression later on is more
2005        efficient in terms of memory by ensuring the most common value is in the
2006        middle and the least common are on the outside.  IMO this would be better
2007        than a most to least common mapping as theres a decent chance the most
2008        common letter will share a node with the least common, meaning the node
2009        will not be compressible. With a middle is most common approach the worst
2010        case is when we have the least common nodes twice.
2011
2012      */
2013
2014     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2015         regnode *noper = NEXTOPER( cur );
2016         const U8 *uc = (U8*)STRING( noper );
2017         const U8 *e  = uc + STR_LEN( noper );
2018         STRLEN foldlen = 0;
2019         U32 wordlen      = 0;         /* required init */
2020         STRLEN minbytes = 0;
2021         STRLEN maxbytes = 0;
2022         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
2023
2024         if (OP(noper) == NOTHING) {
2025             regnode *noper_next= regnext(noper);
2026             if (noper_next != tail && OP(noper_next) == flags) {
2027                 noper = noper_next;
2028                 uc= (U8*)STRING(noper);
2029                 e= uc + STR_LEN(noper);
2030                 trie->minlen= STR_LEN(noper);
2031             } else {
2032                 trie->minlen= 0;
2033                 continue;
2034             }
2035         }
2036
2037         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2038             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2039                                           regardless of encoding */
2040             if (OP( noper ) == EXACTFU_SS) {
2041                 /* false positives are ok, so just set this */
2042                 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2043             }
2044         }
2045         for ( ; uc < e ; uc += len ) {
2046             TRIE_CHARCOUNT(trie)++;
2047             TRIE_READ_CHAR;
2048
2049             /* Acummulate to the current values, the range in the number of
2050              * bytes that this character could match.  The max is presumed to
2051              * be the same as the folded input (which TRIE_READ_CHAR returns),
2052              * except that when this is not in UTF-8, it could be matched
2053              * against a string which is UTF-8, and the variant characters
2054              * could be 2 bytes instead of the 1 here.  Likewise, for the
2055              * minimum number of bytes when not folded.  When folding, the min
2056              * is assumed to be 1 byte could fold to match the single character
2057              * here, or in the case of a multi-char fold, 1 byte can fold to
2058              * the whole sequence.  'foldlen' is used to denote whether we are
2059              * in such a sequence, skipping the min setting if so.  XXX TODO
2060              * Use the exact list of what folds to each character, from
2061              * PL_utf8_foldclosures */
2062             if (UTF) {
2063                 maxbytes += UTF8SKIP(uc);
2064                 if (! folder) {
2065                     /* A non-UTF-8 string could be 1 byte to match our 2 */
2066                     minbytes += (UTF8_IS_DOWNGRADEABLE_START(*uc))
2067                                 ? 1
2068                                 : UTF8SKIP(uc);
2069                 }
2070                 else {
2071                     if (foldlen) {
2072                         foldlen -= UTF8SKIP(uc);
2073                     }
2074                     else {
2075                         foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e);
2076                         minbytes++;
2077                     }
2078                 }
2079             }
2080             else {
2081                 maxbytes += (UNI_IS_INVARIANT(*uc))
2082                              ? 1
2083                              : 2;
2084                 if (! folder) {
2085                     minbytes++;
2086                 }
2087                 else {
2088                     if (foldlen) {
2089                         foldlen--;
2090                     }
2091                     else {
2092                         foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e);
2093                         minbytes++;
2094                     }
2095                 }
2096             }
2097             if ( uvc < 256 ) {
2098                 if ( folder ) {
2099                     U8 folded= folder[ (U8) uvc ];
2100                     if ( !trie->charmap[ folded ] ) {
2101                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
2102                         TRIE_STORE_REVCHAR( folded );
2103                     }
2104                 }
2105                 if ( !trie->charmap[ uvc ] ) {
2106                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2107                     TRIE_STORE_REVCHAR( uvc );
2108                 }
2109                 if ( set_bit ) {
2110                     /* store the codepoint in the bitmap, and its folded
2111                      * equivalent. */
2112                     TRIE_BITMAP_SET(trie, uvc);
2113
2114                     /* store the folded codepoint */
2115                     if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2116
2117                     if ( !UTF ) {
2118                         /* store first byte of utf8 representation of
2119                            variant codepoints */
2120                         if (! UVCHR_IS_INVARIANT(uvc)) {
2121                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2122                         }
2123                     }
2124                     set_bit = 0; /* We've done our bit :-) */
2125                 }
2126             } else {
2127                 SV** svpp;
2128                 if ( !widecharmap )
2129                     widecharmap = newHV();
2130
2131                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2132
2133                 if ( !svpp )
2134                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2135
2136                 if ( !SvTRUE( *svpp ) ) {
2137                     sv_setiv( *svpp, ++trie->uniquecharcount );
2138                     TRIE_STORE_REVCHAR(uvc);
2139                 }
2140             }
2141         }
2142         if( cur == first ) {
2143             trie->minlen = minbytes;
2144             trie->maxlen = maxbytes;
2145         } else if (minbytes < trie->minlen) {
2146             trie->minlen = minbytes;
2147         } else if (maxbytes > trie->maxlen) {
2148             trie->maxlen = maxbytes;
2149         }
2150     } /* end first pass */
2151     DEBUG_TRIE_COMPILE_r(
2152         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2153                 (int)depth * 2 + 2,"",
2154                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2155                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2156                 (int)trie->minlen, (int)trie->maxlen )
2157     );
2158
2159     /*
2160         We now know what we are dealing with in terms of unique chars and
2161         string sizes so we can calculate how much memory a naive
2162         representation using a flat table  will take. If it's over a reasonable
2163         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2164         conservative but potentially much slower representation using an array
2165         of lists.
2166
2167         At the end we convert both representations into the same compressed
2168         form that will be used in regexec.c for matching with. The latter
2169         is a form that cannot be used to construct with but has memory
2170         properties similar to the list form and access properties similar
2171         to the table form making it both suitable for fast searches and
2172         small enough that its feasable to store for the duration of a program.
2173
2174         See the comment in the code where the compressed table is produced
2175         inplace from the flat tabe representation for an explanation of how
2176         the compression works.
2177
2178     */
2179
2180
2181     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2182     prev_states[1] = 0;
2183
2184     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
2185         /*
2186             Second Pass -- Array Of Lists Representation
2187
2188             Each state will be represented by a list of charid:state records
2189             (reg_trie_trans_le) the first such element holds the CUR and LEN
2190             points of the allocated array. (See defines above).
2191
2192             We build the initial structure using the lists, and then convert
2193             it into the compressed table form which allows faster lookups
2194             (but cant be modified once converted).
2195         */
2196
2197         STRLEN transcount = 1;
2198
2199         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2200             "%*sCompiling trie using list compiler\n",
2201             (int)depth * 2 + 2, ""));
2202
2203         trie->states = (reg_trie_state *)
2204             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2205                                   sizeof(reg_trie_state) );
2206         TRIE_LIST_NEW(1);
2207         next_alloc = 2;
2208
2209         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2210
2211             regnode *noper   = NEXTOPER( cur );
2212             U8 *uc           = (U8*)STRING( noper );
2213             const U8 *e      = uc + STR_LEN( noper );
2214             U32 state        = 1;         /* required init */
2215             U16 charid       = 0;         /* sanity init */
2216             U32 wordlen      = 0;         /* required init */
2217
2218             if (OP(noper) == NOTHING) {
2219                 regnode *noper_next= regnext(noper);
2220                 if (noper_next != tail && OP(noper_next) == flags) {
2221                     noper = noper_next;
2222                     uc= (U8*)STRING(noper);
2223                     e= uc + STR_LEN(noper);
2224                 }
2225             }
2226
2227             if (OP(noper) != NOTHING) {
2228                 for ( ; uc < e ; uc += len ) {
2229
2230                     TRIE_READ_CHAR;
2231
2232                     if ( uvc < 256 ) {
2233                         charid = trie->charmap[ uvc ];
2234                     } else {
2235                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2236                         if ( !svpp ) {
2237                             charid = 0;
2238                         } else {
2239                             charid=(U16)SvIV( *svpp );
2240                         }
2241                     }
2242                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
2243                     if ( charid ) {
2244
2245                         U16 check;
2246                         U32 newstate = 0;
2247
2248                         charid--;
2249                         if ( !trie->states[ state ].trans.list ) {
2250                             TRIE_LIST_NEW( state );
2251                         }
2252                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
2253                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
2254                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2255                                 break;
2256                             }
2257                         }
2258                         if ( ! newstate ) {
2259                             newstate = next_alloc++;
2260                             prev_states[newstate] = state;
2261                             TRIE_LIST_PUSH( state, charid, newstate );
2262                             transcount++;
2263                         }
2264                         state = newstate;
2265                     } else {
2266                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2267                     }
2268                 }
2269             }
2270             TRIE_HANDLE_WORD(state);
2271
2272         } /* end second pass */
2273
2274         /* next alloc is the NEXT state to be allocated */
2275         trie->statecount = next_alloc;
2276         trie->states = (reg_trie_state *)
2277             PerlMemShared_realloc( trie->states,
2278                                    next_alloc
2279                                    * sizeof(reg_trie_state) );
2280
2281         /* and now dump it out before we compress it */
2282         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2283                                                          revcharmap, next_alloc,
2284                                                          depth+1)
2285         );
2286
2287         trie->trans = (reg_trie_trans *)
2288             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2289         {
2290             U32 state;
2291             U32 tp = 0;
2292             U32 zp = 0;
2293
2294
2295             for( state=1 ; state < next_alloc ; state ++ ) {
2296                 U32 base=0;
2297
2298                 /*
2299                 DEBUG_TRIE_COMPILE_MORE_r(
2300                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2301                 );
2302                 */
2303
2304                 if (trie->states[state].trans.list) {
2305                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2306                     U16 maxid=minid;
2307                     U16 idx;
2308
2309                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2310                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2311                         if ( forid < minid ) {
2312                             minid=forid;
2313                         } else if ( forid > maxid ) {
2314                             maxid=forid;
2315                         }
2316                     }
2317                     if ( transcount < tp + maxid - minid + 1) {
2318                         transcount *= 2;
2319                         trie->trans = (reg_trie_trans *)
2320                             PerlMemShared_realloc( trie->trans,
2321                                                      transcount
2322                                                      * sizeof(reg_trie_trans) );
2323                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
2324                     }
2325                     base = trie->uniquecharcount + tp - minid;
2326                     if ( maxid == minid ) {
2327                         U32 set = 0;
2328                         for ( ; zp < tp ; zp++ ) {
2329                             if ( ! trie->trans[ zp ].next ) {
2330                                 base = trie->uniquecharcount + zp - minid;
2331                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
2332                                 trie->trans[ zp ].check = state;
2333                                 set = 1;
2334                                 break;
2335                             }
2336                         }
2337                         if ( !set ) {
2338                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
2339                             trie->trans[ tp ].check = state;
2340                             tp++;
2341                             zp = tp;
2342                         }
2343                     } else {
2344                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2345                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
2346                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
2347                             trie->trans[ tid ].check = state;
2348                         }
2349                         tp += ( maxid - minid + 1 );
2350                     }
2351                     Safefree(trie->states[ state ].trans.list);
2352                 }
2353                 /*
2354                 DEBUG_TRIE_COMPILE_MORE_r(
2355                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2356                 );
2357                 */
2358                 trie->states[ state ].trans.base=base;
2359             }
2360             trie->lasttrans = tp + 1;
2361         }
2362     } else {
2363         /*
2364            Second Pass -- Flat Table Representation.
2365
2366            we dont use the 0 slot of either trans[] or states[] so we add 1 to
2367            each.  We know that we will need Charcount+1 trans at most to store
2368            the data (one row per char at worst case) So we preallocate both
2369            structures assuming worst case.
2370
2371            We then construct the trie using only the .next slots of the entry
2372            structs.
2373
2374            We use the .check field of the first entry of the node temporarily
2375            to make compression both faster and easier by keeping track of how
2376            many non zero fields are in the node.
2377
2378            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2379            transition.
2380
2381            There are two terms at use here: state as a TRIE_NODEIDX() which is
2382            a number representing the first entry of the node, and state as a
2383            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2384            and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2385            if there are 2 entrys per node. eg:
2386
2387              A B       A B
2388           1. 2 4    1. 3 7
2389           2. 0 3    3. 0 5
2390           3. 0 0    5. 0 0
2391           4. 0 0    7. 0 0
2392
2393            The table is internally in the right hand, idx form. However as we
2394            also have to deal with the states array which is indexed by nodenum
2395            we have to use TRIE_NODENUM() to convert.
2396
2397         */
2398         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2399             "%*sCompiling trie using table compiler\n",
2400             (int)depth * 2 + 2, ""));
2401
2402         trie->trans = (reg_trie_trans *)
2403             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2404                                   * trie->uniquecharcount + 1,
2405                                   sizeof(reg_trie_trans) );
2406         trie->states = (reg_trie_state *)
2407             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2408                                   sizeof(reg_trie_state) );
2409         next_alloc = trie->uniquecharcount + 1;
2410
2411
2412         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2413
2414             regnode *noper   = NEXTOPER( cur );
2415             const U8 *uc     = (U8*)STRING( noper );
2416             const U8 *e      = uc + STR_LEN( noper );
2417
2418             U32 state        = 1;         /* required init */
2419
2420             U16 charid       = 0;         /* sanity init */
2421             U32 accept_state = 0;         /* sanity init */
2422
2423             U32 wordlen      = 0;         /* required init */
2424
2425             if (OP(noper) == NOTHING) {
2426                 regnode *noper_next= regnext(noper);
2427                 if (noper_next != tail && OP(noper_next) == flags) {
2428                     noper = noper_next;
2429                     uc= (U8*)STRING(noper);
2430                     e= uc + STR_LEN(noper);
2431                 }
2432             }
2433
2434             if ( OP(noper) != NOTHING ) {
2435                 for ( ; uc < e ; uc += len ) {
2436
2437                     TRIE_READ_CHAR;
2438
2439                     if ( uvc < 256 ) {
2440                         charid = trie->charmap[ uvc ];
2441                     } else {
2442                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2443                         charid = svpp ? (U16)SvIV(*svpp) : 0;
2444                     }
2445                     if ( charid ) {
2446                         charid--;
2447                         if ( !trie->trans[ state + charid ].next ) {
2448                             trie->trans[ state + charid ].next = next_alloc;
2449                             trie->trans[ state ].check++;
2450                             prev_states[TRIE_NODENUM(next_alloc)]
2451                                     = TRIE_NODENUM(state);
2452                             next_alloc += trie->uniquecharcount;
2453                         }
2454                         state = trie->trans[ state + charid ].next;
2455                     } else {
2456                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2457                     }
2458                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
2459                 }
2460             }
2461             accept_state = TRIE_NODENUM( state );
2462             TRIE_HANDLE_WORD(accept_state);
2463
2464         } /* end second pass */
2465
2466         /* and now dump it out before we compress it */
2467         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2468                                                           revcharmap,
2469                                                           next_alloc, depth+1));
2470
2471         {
2472         /*
2473            * Inplace compress the table.*
2474
2475            For sparse data sets the table constructed by the trie algorithm will
2476            be mostly 0/FAIL transitions or to put it another way mostly empty.
2477            (Note that leaf nodes will not contain any transitions.)
2478
2479            This algorithm compresses the tables by eliminating most such
2480            transitions, at the cost of a modest bit of extra work during lookup:
2481
2482            - Each states[] entry contains a .base field which indicates the
2483            index in the state[] array wheres its transition data is stored.
2484
2485            - If .base is 0 there are no valid transitions from that node.
2486
2487            - If .base is nonzero then charid is added to it to find an entry in
2488            the trans array.
2489
2490            -If trans[states[state].base+charid].check!=state then the
2491            transition is taken to be a 0/Fail transition. Thus if there are fail
2492            transitions at the front of the node then the .base offset will point
2493            somewhere inside the previous nodes data (or maybe even into a node
2494            even earlier), but the .check field determines if the transition is
2495            valid.
2496
2497            XXX - wrong maybe?
2498            The following process inplace converts the table to the compressed
2499            table: We first do not compress the root node 1,and mark all its
2500            .check pointers as 1 and set its .base pointer as 1 as well. This
2501            allows us to do a DFA construction from the compressed table later,
2502            and ensures that any .base pointers we calculate later are greater
2503            than 0.
2504
2505            - We set 'pos' to indicate the first entry of the second node.
2506
2507            - We then iterate over the columns of the node, finding the first and
2508            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2509            and set the .check pointers accordingly, and advance pos
2510            appropriately and repreat for the next node. Note that when we copy
2511            the next pointers we have to convert them from the original
2512            NODEIDX form to NODENUM form as the former is not valid post
2513            compression.
2514
2515            - If a node has no transitions used we mark its base as 0 and do not
2516            advance the pos pointer.
2517
2518            - If a node only has one transition we use a second pointer into the
2519            structure to fill in allocated fail transitions from other states.
2520            This pointer is independent of the main pointer and scans forward
2521            looking for null transitions that are allocated to a state. When it
2522            finds one it writes the single transition into the "hole".  If the
2523            pointer doesnt find one the single transition is appended as normal.
2524
2525            - Once compressed we can Renew/realloc the structures to release the
2526            excess space.
2527
2528            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2529            specifically Fig 3.47 and the associated pseudocode.
2530
2531            demq
2532         */
2533         const U32 laststate = TRIE_NODENUM( next_alloc );
2534         U32 state, charid;
2535         U32 pos = 0, zp=0;
2536         trie->statecount = laststate;
2537
2538         for ( state = 1 ; state < laststate ; state++ ) {
2539             U8 flag = 0;
2540             const U32 stateidx = TRIE_NODEIDX( state );
2541             const U32 o_used = trie->trans[ stateidx ].check;
2542             U32 used = trie->trans[ stateidx ].check;
2543             trie->trans[ stateidx ].check = 0;
2544
2545             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2546                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2547                     if ( trie->trans[ stateidx + charid ].next ) {
2548                         if (o_used == 1) {
2549                             for ( ; zp < pos ; zp++ ) {
2550                                 if ( ! trie->trans[ zp ].next ) {
2551                                     break;
2552                                 }
2553                             }
2554                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2555                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2556                             trie->trans[ zp ].check = state;
2557                             if ( ++zp > pos ) pos = zp;
2558                             break;
2559                         }
2560                         used--;
2561                     }
2562                     if ( !flag ) {
2563                         flag = 1;
2564                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2565                     }
2566                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2567                     trie->trans[ pos ].check = state;
2568                     pos++;
2569                 }
2570             }
2571         }
2572         trie->lasttrans = pos + 1;
2573         trie->states = (reg_trie_state *)
2574             PerlMemShared_realloc( trie->states, laststate
2575                                    * sizeof(reg_trie_state) );
2576         DEBUG_TRIE_COMPILE_MORE_r(
2577                 PerlIO_printf( Perl_debug_log,
2578                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2579                     (int)depth * 2 + 2,"",
2580                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2581                     (IV)next_alloc,
2582                     (IV)pos,
2583                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2584             );
2585
2586         } /* end table compress */
2587     }
2588     DEBUG_TRIE_COMPILE_MORE_r(
2589             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2590                 (int)depth * 2 + 2, "",
2591                 (UV)trie->statecount,
2592                 (UV)trie->lasttrans)
2593     );
2594     /* resize the trans array to remove unused space */
2595     trie->trans = (reg_trie_trans *)
2596         PerlMemShared_realloc( trie->trans, trie->lasttrans
2597                                * sizeof(reg_trie_trans) );
2598
2599     {   /* Modify the program and insert the new TRIE node */
2600         U8 nodetype =(U8)(flags & 0xFF);
2601         char *str=NULL;
2602
2603 #ifdef DEBUGGING
2604         regnode *optimize = NULL;
2605 #ifdef RE_TRACK_PATTERN_OFFSETS
2606
2607         U32 mjd_offset = 0;
2608         U32 mjd_nodelen = 0;
2609 #endif /* RE_TRACK_PATTERN_OFFSETS */
2610 #endif /* DEBUGGING */
2611         /*
2612            This means we convert either the first branch or the first Exact,
2613            depending on whether the thing following (in 'last') is a branch
2614            or not and whther first is the startbranch (ie is it a sub part of
2615            the alternation or is it the whole thing.)
2616            Assuming its a sub part we convert the EXACT otherwise we convert
2617            the whole branch sequence, including the first.
2618          */
2619         /* Find the node we are going to overwrite */
2620         if ( first != startbranch || OP( last ) == BRANCH ) {
2621             /* branch sub-chain */
2622             NEXT_OFF( first ) = (U16)(last - first);
2623 #ifdef RE_TRACK_PATTERN_OFFSETS
2624             DEBUG_r({
2625                 mjd_offset= Node_Offset((convert));
2626                 mjd_nodelen= Node_Length((convert));
2627             });
2628 #endif
2629             /* whole branch chain */
2630         }
2631 #ifdef RE_TRACK_PATTERN_OFFSETS
2632         else {
2633             DEBUG_r({
2634                 const  regnode *nop = NEXTOPER( convert );
2635                 mjd_offset= Node_Offset((nop));
2636                 mjd_nodelen= Node_Length((nop));
2637             });
2638         }
2639         DEBUG_OPTIMISE_r(
2640             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2641                 (int)depth * 2 + 2, "",
2642                 (UV)mjd_offset, (UV)mjd_nodelen)
2643         );
2644 #endif
2645         /* But first we check to see if there is a common prefix we can
2646            split out as an EXACT and put in front of the TRIE node.  */
2647         trie->startstate= 1;
2648         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2649             U32 state;
2650             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2651                 U32 ofs = 0;
2652                 I32 idx = -1;
2653                 U32 count = 0;
2654                 const U32 base = trie->states[ state ].trans.base;
2655
2656                 if ( trie->states[state].wordnum )
2657                         count = 1;
2658
2659                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2660                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2661                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2662                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2663                     {
2664                         if ( ++count > 1 ) {
2665                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2666                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2667                             if ( state == 1 ) break;
2668                             if ( count == 2 ) {
2669                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2670                                 DEBUG_OPTIMISE_r(
2671                                     PerlIO_printf(Perl_debug_log,
2672                                         "%*sNew Start State=%"UVuf" Class: [",
2673                                         (int)depth * 2 + 2, "",
2674                                         (UV)state));
2675                                 if (idx >= 0) {
2676                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2677                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2678
2679                                     TRIE_BITMAP_SET(trie,*ch);
2680                                     if ( folder )
2681                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2682                                     DEBUG_OPTIMISE_r(
2683                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2684                                     );
2685                                 }
2686                             }
2687                             TRIE_BITMAP_SET(trie,*ch);
2688                             if ( folder )
2689                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2690                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2691                         }
2692                         idx = ofs;
2693                     }
2694                 }
2695                 if ( count == 1 ) {
2696                     SV **tmp = av_fetch( revcharmap, idx, 0);
2697                     STRLEN len;
2698                     char *ch = SvPV( *tmp, len );
2699                     DEBUG_OPTIMISE_r({
2700                         SV *sv=sv_newmortal();
2701                         PerlIO_printf( Perl_debug_log,
2702                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2703                             (int)depth * 2 + 2, "",
2704                             (UV)state, (UV)idx,
2705                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2706                                 PL_colors[0], PL_colors[1],
2707                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2708                                 PERL_PV_ESCAPE_FIRSTCHAR
2709                             )
2710                         );
2711                     });
2712                     if ( state==1 ) {
2713                         OP( convert ) = nodetype;
2714                         str=STRING(convert);
2715                         STR_LEN(convert)=0;
2716                     }
2717                     STR_LEN(convert) += len;
2718                     while (len--)
2719                         *str++ = *ch++;
2720                 } else {
2721 #ifdef DEBUGGING
2722                     if (state>1)
2723                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2724 #endif
2725                     break;
2726                 }
2727             }
2728             trie->prefixlen = (state-1);
2729             if (str) {
2730                 regnode *n = convert+NODE_SZ_STR(convert);
2731                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2732                 trie->startstate = state;
2733                 trie->minlen -= (state - 1);
2734                 trie->maxlen -= (state - 1);
2735 #ifdef DEBUGGING
2736                /* At least the UNICOS C compiler choked on this
2737                 * being argument to DEBUG_r(), so let's just have
2738                 * it right here. */
2739                if (
2740 #ifdef PERL_EXT_RE_BUILD
2741                    1
2742 #else
2743                    DEBUG_r_TEST
2744 #endif
2745                    ) {
2746                    regnode *fix = convert;
2747                    U32 word = trie->wordcount;
2748                    mjd_nodelen++;
2749                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2750                    while( ++fix < n ) {
2751                        Set_Node_Offset_Length(fix, 0, 0);
2752                    }
2753                    while (word--) {
2754                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2755                        if (tmp) {
2756                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2757                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2758                            else
2759                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2760                        }
2761                    }
2762                }
2763 #endif
2764                 if (trie->maxlen) {
2765                     convert = n;
2766                 } else {
2767                     NEXT_OFF(convert) = (U16)(tail - convert);
2768                     DEBUG_r(optimize= n);
2769                 }
2770             }
2771         }
2772         if (!jumper)
2773             jumper = last;
2774         if ( trie->maxlen ) {
2775             NEXT_OFF( convert ) = (U16)(tail - convert);
2776             ARG_SET( convert, data_slot );
2777             /* Store the offset to the first unabsorbed branch in
2778                jump[0], which is otherwise unused by the jump logic.
2779                We use this when dumping a trie and during optimisation. */
2780             if (trie->jump)
2781                 trie->jump[0] = (U16)(nextbranch - convert);
2782
2783             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2784              *   and there is a bitmap
2785              *   and the first "jump target" node we found leaves enough room
2786              * then convert the TRIE node into a TRIEC node, with the bitmap
2787              * embedded inline in the opcode - this is hypothetically faster.
2788              */
2789             if ( !trie->states[trie->startstate].wordnum
2790                  && trie->bitmap
2791                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2792             {
2793                 OP( convert ) = TRIEC;
2794                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2795                 PerlMemShared_free(trie->bitmap);
2796                 trie->bitmap= NULL;
2797             } else
2798                 OP( convert ) = TRIE;
2799
2800             /* store the type in the flags */
2801             convert->flags = nodetype;
2802             DEBUG_r({
2803             optimize = convert
2804                       + NODE_STEP_REGNODE
2805                       + regarglen[ OP( convert ) ];
2806             });
2807             /* XXX We really should free up the resource in trie now,
2808                    as we won't use them - (which resources?) dmq */
2809         }
2810         /* needed for dumping*/
2811         DEBUG_r(if (optimize) {
2812             regnode *opt = convert;
2813
2814             while ( ++opt < optimize) {
2815                 Set_Node_Offset_Length(opt,0,0);
2816             }
2817             /*
2818                 Try to clean up some of the debris left after the
2819                 optimisation.
2820              */
2821             while( optimize < jumper ) {
2822                 mjd_nodelen += Node_Length((optimize));
2823                 OP( optimize ) = OPTIMIZED;
2824                 Set_Node_Offset_Length(optimize,0,0);
2825                 optimize++;
2826             }
2827             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2828         });
2829     } /* end node insert */
2830
2831     /*  Finish populating the prev field of the wordinfo array.  Walk back
2832      *  from each accept state until we find another accept state, and if
2833      *  so, point the first word's .prev field at the second word. If the
2834      *  second already has a .prev field set, stop now. This will be the
2835      *  case either if we've already processed that word's accept state,
2836      *  or that state had multiple words, and the overspill words were
2837      *  already linked up earlier.
2838      */
2839     {
2840         U16 word;
2841         U32 state;
2842         U16 prev;
2843
2844         for (word=1; word <= trie->wordcount; word++) {
2845             prev = 0;
2846             if (trie->wordinfo[word].prev)
2847                 continue;
2848             state = trie->wordinfo[word].accept;
2849             while (state) {
2850                 state = prev_states[state];
2851                 if (!state)
2852                     break;
2853                 prev = trie->states[state].wordnum;
2854                 if (prev)
2855                     break;
2856             }
2857             trie->wordinfo[word].prev = prev;
2858         }
2859         Safefree(prev_states);
2860     }
2861
2862
2863     /* and now dump out the compressed format */
2864     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2865
2866     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2867 #ifdef DEBUGGING
2868     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2869     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2870 #else
2871     SvREFCNT_dec_NN(revcharmap);
2872 #endif
2873     return trie->jump
2874            ? MADE_JUMP_TRIE
2875            : trie->startstate>1
2876              ? MADE_EXACT_TRIE
2877              : MADE_TRIE;
2878 }
2879
2880 STATIC void
2881 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2882 {
2883 /* The Trie is constructed and compressed now so we can build a fail array if
2884  * it's needed
2885
2886    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
2887    3.32 in the
2888    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
2889    Ullman 1985/88
2890    ISBN 0-201-10088-6
2891
2892    We find the fail state for each state in the trie, this state is the longest
2893    proper suffix of the current state's 'word' that is also a proper prefix of
2894    another word in our trie. State 1 represents the word '' and is thus the
2895    default fail state. This allows the DFA not to have to restart after its
2896    tried and failed a word at a given point, it simply continues as though it
2897    had been matching the other word in the first place.
2898    Consider
2899       'abcdgu'=~/abcdefg|cdgu/
2900    When we get to 'd' we are still matching the first word, we would encounter
2901    'g' which would fail, which would bring us to the state representing 'd' in
2902    the second word where we would try 'g' and succeed, proceeding to match
2903    'cdgu'.
2904  */
2905  /* add a fail transition */
2906     const U32 trie_offset = ARG(source);
2907     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2908     U32 *q;
2909     const U32 ucharcount = trie->uniquecharcount;
2910     const U32 numstates = trie->statecount;
2911     const U32 ubound = trie->lasttrans + ucharcount;
2912     U32 q_read = 0;
2913     U32 q_write = 0;
2914     U32 charid;
2915     U32 base = trie->states[ 1 ].trans.base;
2916     U32 *fail;
2917     reg_ac_data *aho;
2918     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
2919     GET_RE_DEBUG_FLAGS_DECL;
2920
2921     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2922 #ifndef DEBUGGING
2923     PERL_UNUSED_ARG(depth);
2924 #endif
2925
2926
2927     ARG_SET( stclass, data_slot );
2928     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2929     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2930     aho->trie=trie_offset;
2931     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2932     Copy( trie->states, aho->states, numstates, reg_trie_state );
2933     Newxz( q, numstates, U32);
2934     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2935     aho->refcount = 1;
2936     fail = aho->fail;
2937     /* initialize fail[0..1] to be 1 so that we always have
2938        a valid final fail state */
2939     fail[ 0 ] = fail[ 1 ] = 1;
2940
2941     for ( charid = 0; charid < ucharcount ; charid++ ) {
2942         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2943         if ( newstate ) {
2944             q[ q_write ] = newstate;
2945             /* set to point at the root */
2946             fail[ q[ q_write++ ] ]=1;
2947         }
2948     }
2949     while ( q_read < q_write) {
2950         const U32 cur = q[ q_read++ % numstates ];
2951         base = trie->states[ cur ].trans.base;
2952
2953         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2954             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2955             if (ch_state) {
2956                 U32 fail_state = cur;
2957                 U32 fail_base;
2958                 do {
2959                     fail_state = fail[ fail_state ];
2960                     fail_base = aho->states[ fail_state ].trans.base;
2961                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2962
2963                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2964                 fail[ ch_state ] = fail_state;
2965                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2966                 {
2967                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2968                 }
2969                 q[ q_write++ % numstates] = ch_state;
2970             }
2971         }
2972     }
2973     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2974        when we fail in state 1, this allows us to use the
2975        charclass scan to find a valid start char. This is based on the principle
2976        that theres a good chance the string being searched contains lots of stuff
2977        that cant be a start char.
2978      */
2979     fail[ 0 ] = fail[ 1 ] = 0;
2980     DEBUG_TRIE_COMPILE_r({
2981         PerlIO_printf(Perl_debug_log,
2982                       "%*sStclass Failtable (%"UVuf" states): 0",
2983                       (int)(depth * 2), "", (UV)numstates
2984         );
2985         for( q_read=1; q_read<numstates; q_read++ ) {
2986             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2987         }
2988         PerlIO_printf(Perl_debug_log, "\n");
2989     });
2990     Safefree(q);
2991     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2992 }
2993
2994
2995 #define DEBUG_PEEP(str,scan,depth) \
2996     DEBUG_OPTIMISE_r({if (scan){ \
2997        SV * const mysv=sv_newmortal(); \
2998        regnode *Next = regnext(scan); \
2999        regprop(RExC_rx, mysv, scan); \
3000        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
3001        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
3002        Next ? (REG_NODE_NUM(Next)) : 0 ); \
3003    }});
3004
3005
3006 /* The below joins as many adjacent EXACTish nodes as possible into a single
3007  * one.  The regop may be changed if the node(s) contain certain sequences that
3008  * require special handling.  The joining is only done if:
3009  * 1) there is room in the current conglomerated node to entirely contain the
3010  *    next one.
3011  * 2) they are the exact same node type
3012  *
3013  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3014  * these get optimized out
3015  *
3016  * If a node is to match under /i (folded), the number of characters it matches
3017  * can be different than its character length if it contains a multi-character
3018  * fold.  *min_subtract is set to the total delta of the input nodes.
3019  *
3020  * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
3021  * and contains LATIN SMALL LETTER SHARP S
3022  *
3023  * This is as good a place as any to discuss the design of handling these
3024  * multi-character fold sequences.  It's been wrong in Perl for a very long
3025  * time.  There are three code points in Unicode whose multi-character folds
3026  * were long ago discovered to mess things up.  The previous designs for
3027  * dealing with these involved assigning a special node for them.  This
3028  * approach doesn't work, as evidenced by this example:
3029  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
3030  * Both these fold to "sss", but if the pattern is parsed to create a node that
3031  * would match just the \xDF, it won't be able to handle the case where a
3032  * successful match would have to cross the node's boundary.  The new approach
3033  * that hopefully generally solves the problem generates an EXACTFU_SS node
3034  * that is "sss".
3035  *
3036  * It turns out that there are problems with all multi-character folds, and not
3037  * just these three.  Now the code is general, for all such cases.  The
3038  * approach taken is:
3039  * 1)   This routine examines each EXACTFish node that could contain multi-
3040  *      character fold sequences.  It returns in *min_subtract how much to
3041  *      subtract from the the actual length of the string to get a real minimum
3042  *      match length; it is 0 if there are no multi-char folds.  This delta is
3043  *      used by the caller to adjust the min length of the match, and the delta
3044  *      between min and max, so that the optimizer doesn't reject these
3045  *      possibilities based on size constraints.
3046  * 2)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3047  *      is used for an EXACTFU node that contains at least one "ss" sequence in
3048  *      it.  For non-UTF-8 patterns and strings, this is the only case where
3049  *      there is a possible fold length change.  That means that a regular
3050  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
3051  *      with length changes, and so can be processed faster.  regexec.c takes
3052  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
3053  *      pre-folded by regcomp.c.  This saves effort in regex matching.
3054  *      However, the pre-folding isn't done for non-UTF8 patterns because the
3055  *      fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
3056  *      down by forcing the pattern into UTF8 unless necessary.  Also what
3057  *      EXACTF and EXACTFL nodes fold to isn't known until runtime.  The fold
3058  *      possibilities for the non-UTF8 patterns are quite simple, except for
3059  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
3060  *      members of a fold-pair, and arrays are set up for all of them so that
3061  *      the other member of the pair can be found quickly.  Code elsewhere in
3062  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3063  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
3064  *      described in the next item.
3065  * 3)   A problem remains for the sharp s in EXACTF and EXACTFA nodes when the
3066  *      pattern isn't in UTF-8. (BTW, there cannot be an EXACTF node with a
3067  *      UTF-8 pattern.)  An assumption that the optimizer part of regexec.c
3068  *      (probably unwittingly, in Perl_regexec_flags()) makes is that a
3069  *      character in the pattern corresponds to at most a single character in
3070  *      the target string.  (And I do mean character, and not byte here, unlike
3071  *      other parts of the documentation that have never been updated to
3072  *      account for multibyte Unicode.)  sharp s in EXACTF nodes can match the
3073  *      two character string 'ss'; in EXACTFA nodes it can match
3074  *      "\x{17F}\x{17F}".  These violate the assumption, and they are the only
3075  *      instances where it is violated.  I'm reluctant to try to change the
3076  *      assumption, as the code involved is impenetrable to me (khw), so
3077  *      instead the code here punts.  This routine examines (when the pattern
3078  *      isn't UTF-8) EXACTF and EXACTFA nodes for the sharp s, and returns a
3079  *      boolean indicating whether or not the node contains a sharp s.  When it
3080  *      is true, the caller sets a flag that later causes the optimizer in this
3081  *      file to not set values for the floating and fixed string lengths, and
3082  *      thus avoids the optimizer code in regexec.c that makes the invalid
3083  *      assumption.  Thus, there is no optimization based on string lengths for
3084  *      non-UTF8-pattern EXACTF and EXACTFA nodes that contain the sharp s.
3085  *      (The reason the assumption is wrong only in these two cases is that all
3086  *      other non-UTF-8 folds are 1-1; and, for UTF-8 patterns, we pre-fold all
3087  *      other folds to their expanded versions.  We can't prefold sharp s to
3088  *      'ss' in EXACTF nodes because we don't know at compile time if it
3089  *      actually matches 'ss' or not.  It will match iff the target string is
3090  *      in UTF-8, unlike the EXACTFU nodes, where it always matches; and
3091  *      EXACTFA and EXACTFL where it never does.  In an EXACTFA node in a UTF-8
3092  *      pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the problem;
3093  *      but in a non-UTF8 pattern, folding it to that above-Latin1 string would
3094  *      require the pattern to be forced into UTF-8, the overhead of which we
3095  *      want to avoid.)
3096  *
3097  *      Similarly, the code that generates tries doesn't currently handle
3098  *      not-already-folded multi-char folds, and it looks like a pain to change
3099  *      that.  Therefore, trie generation of EXACTFA nodes with the sharp s
3100  *      doesn't work.  Instead, such an EXACTFA is turned into a new regnode,
3101  *      EXACTFA_NO_TRIE, which the trie code knows not to handle.  Most people
3102  *      using /iaa matching will be doing so almost entirely with ASCII
3103  *      strings, so this should rarely be encountered in practice */
3104
3105 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
3106     if (PL_regkind[OP(scan)] == EXACT) \
3107         join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
3108
3109 STATIC U32
3110 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
3111     /* Merge several consecutive EXACTish nodes into one. */
3112     regnode *n = regnext(scan);
3113     U32 stringok = 1;
3114     regnode *next = scan + NODE_SZ_STR(scan);
3115     U32 merged = 0;
3116     U32 stopnow = 0;
3117 #ifdef DEBUGGING
3118     regnode *stop = scan;
3119     GET_RE_DEBUG_FLAGS_DECL;
3120 #else
3121     PERL_UNUSED_ARG(depth);
3122 #endif
3123
3124     PERL_ARGS_ASSERT_JOIN_EXACT;
3125 #ifndef EXPERIMENTAL_INPLACESCAN
3126     PERL_UNUSED_ARG(flags);
3127     PERL_UNUSED_ARG(val);
3128 #endif
3129     DEBUG_PEEP("join",scan,depth);
3130
3131     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
3132      * EXACT ones that are mergeable to the current one. */
3133     while (n
3134            && (PL_regkind[OP(n)] == NOTHING
3135                || (stringok && OP(n) == OP(scan)))
3136            && NEXT_OFF(n)
3137            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3138     {
3139
3140         if (OP(n) == TAIL || n > next)
3141             stringok = 0;
3142         if (PL_regkind[OP(n)] == NOTHING) {
3143             DEBUG_PEEP("skip:",n,depth);
3144             NEXT_OFF(scan) += NEXT_OFF(n);
3145             next = n + NODE_STEP_REGNODE;
3146 #ifdef DEBUGGING
3147             if (stringok)
3148                 stop = n;
3149 #endif
3150             n = regnext(n);
3151         }
3152         else if (stringok) {
3153             const unsigned int oldl = STR_LEN(scan);
3154             regnode * const nnext = regnext(n);
3155
3156             /* XXX I (khw) kind of doubt that this works on platforms where
3157              * U8_MAX is above 255 because of lots of other assumptions */
3158             /* Don't join if the sum can't fit into a single node */
3159             if (oldl + STR_LEN(n) > U8_MAX)
3160                 break;
3161
3162             DEBUG_PEEP("merg",n,depth);
3163             merged++;
3164
3165             NEXT_OFF(scan) += NEXT_OFF(n);
3166             STR_LEN(scan) += STR_LEN(n);
3167             next = n + NODE_SZ_STR(n);
3168             /* Now we can overwrite *n : */
3169             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3170 #ifdef DEBUGGING
3171             stop = next - 1;
3172 #endif
3173             n = nnext;
3174             if (stopnow) break;
3175         }
3176
3177 #ifdef EXPERIMENTAL_INPLACESCAN
3178         if (flags && !NEXT_OFF(n)) {
3179             DEBUG_PEEP("atch", val, depth);
3180             if (reg_off_by_arg[OP(n)]) {
3181                 ARG_SET(n, val - n);
3182             }
3183             else {
3184                 NEXT_OFF(n) = val - n;
3185             }
3186             stopnow = 1;
3187         }
3188 #endif
3189     }
3190
3191     *min_subtract = 0;
3192     *has_exactf_sharp_s = FALSE;
3193
3194     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
3195      * can now analyze for sequences of problematic code points.  (Prior to
3196      * this final joining, sequences could have been split over boundaries, and
3197      * hence missed).  The sequences only happen in folding, hence for any
3198      * non-EXACT EXACTish node */
3199     if (OP(scan) != EXACT) {
3200         const U8 * const s0 = (U8*) STRING(scan);
3201         const U8 * s = s0;
3202         const U8 * const s_end = s0 + STR_LEN(scan);
3203
3204         /* One pass is made over the node's string looking for all the
3205          * possibilities.  to avoid some tests in the loop, there are two main
3206          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3207          * non-UTF-8 */
3208         if (UTF) {
3209
3210             /* Examine the string for a multi-character fold sequence.  UTF-8
3211              * patterns have all characters pre-folded by the time this code is
3212              * executed */
3213             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3214                                      length sequence we are looking for is 2 */
3215             {
3216                 int count = 0;
3217                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3218                 if (! len) {    /* Not a multi-char fold: get next char */
3219                     s += UTF8SKIP(s);
3220                     continue;
3221                 }
3222
3223                 /* Nodes with 'ss' require special handling, except for EXACTFL
3224                  * and EXACTFA-ish for which there is no multi-char fold to
3225                  * this */
3226                 if (len == 2 && *s == 's' && *(s+1) == 's'
3227                     && OP(scan) != EXACTFL
3228                     && OP(scan) != EXACTFA
3229                     && OP(scan) != EXACTFA_NO_TRIE)
3230                 {
3231                     count = 2;
3232                     OP(scan) = EXACTFU_SS;
3233                     s += 2;
3234                 }
3235                 else { /* Here is a generic multi-char fold. */
3236                     const U8* multi_end  = s + len;
3237
3238                     /* Count how many characters in it.  In the case of /l and
3239                      * /aa, no folds which contain ASCII code points are
3240                      * allowed, so check for those, and skip if found.  (In
3241                      * EXACTFL, no folds are allowed to any Latin1 code point,
3242                      * not just ASCII.  But there aren't any of these
3243                      * currently, nor ever likely, so don't take the time to
3244                      * test for them.  The code that generates the
3245                      * is_MULTI_foo() macros croaks should one actually get put
3246                      * into Unicode .) */
3247                     if (OP(scan) != EXACTFL
3248                         && OP(scan) != EXACTFA
3249                         && OP(scan) != EXACTFA_NO_TRIE)
3250                     {
3251                         count = utf8_length(s, multi_end);
3252                         s = multi_end;
3253                     }
3254                     else {
3255                         while (s < multi_end) {
3256                             if (isASCII(*s)) {
3257                                 s++;
3258                                 goto next_iteration;
3259                             }
3260                             else {
3261                                 s += UTF8SKIP(s);
3262                             }
3263                             count++;
3264                         }
3265                     }
3266                 }
3267
3268                 /* The delta is how long the sequence is minus 1 (1 is how long
3269                  * the character that folds to the sequence is) */
3270                 *min_subtract += count - 1;
3271             next_iteration: ;
3272             }
3273         }
3274         else if (OP(scan) == EXACTFA) {
3275
3276             /* Non-UTF-8 pattern, EXACTFA node.  There can't be a multi-char
3277              * fold to the ASCII range (and there are no existing ones in the
3278              * upper latin1 range).  But, as outlined in the comments preceding
3279              * this function, we need to flag any occurrences of the sharp s.
3280              * This character forbids trie formation (because of added
3281              * complexity) */
3282             while (s < s_end) {
3283                 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3284                     OP(scan) = EXACTFA_NO_TRIE;
3285                     *has_exactf_sharp_s = TRUE;
3286                     break;
3287                 }
3288                 s++;
3289                 continue;
3290             }
3291         }
3292         else if (OP(scan) != EXACTFL) {
3293
3294             /* Non-UTF-8 pattern, not EXACTFA nor EXACTFL node.  Look for the
3295              * multi-char folds that are all Latin1.  (This code knows that
3296              * there are no current multi-char folds possible with EXACTFL,
3297              * relying on fold_grind.t to catch any errors if the very unlikely
3298              * event happens that some get added in future Unicode versions.)
3299              * As explained in the comments preceding this function, we look
3300              * also for the sharp s in EXACTF nodes; it can be in the final
3301              * position.  Otherwise we can stop looking 1 byte earlier because
3302              * have to find at least two characters for a multi-fold */
3303             const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
3304
3305             while (s < upper) {
3306                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3307                 if (! len) {    /* Not a multi-char fold. */
3308                     if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
3309                     {
3310                         *has_exactf_sharp_s = TRUE;
3311                     }
3312                     s++;
3313                     continue;
3314                 }
3315
3316                 if (len == 2
3317                     && isARG2_lower_or_UPPER_ARG1('s', *s)
3318                     && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
3319                 {
3320
3321                     /* EXACTF nodes need to know that the minimum length
3322                      * changed so that a sharp s in the string can match this
3323                      * ss in the pattern, but they remain EXACTF nodes, as they
3324                      * won't match this unless the target string is is UTF-8,
3325                      * which we don't know until runtime */
3326                     if (OP(scan) != EXACTF) {
3327                         OP(scan) = EXACTFU_SS;
3328                     }
3329                 }
3330
3331                 *min_subtract += len - 1;
3332                 s += len;
3333             }
3334         }
3335     }
3336
3337 #ifdef DEBUGGING
3338     /* Allow dumping but overwriting the collection of skipped
3339      * ops and/or strings with fake optimized ops */
3340     n = scan + NODE_SZ_STR(scan);
3341     while (n <= stop) {
3342         OP(n) = OPTIMIZED;
3343         FLAGS(n) = 0;
3344         NEXT_OFF(n) = 0;
3345         n++;
3346     }
3347 #endif
3348     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3349     return stopnow;
3350 }
3351
3352 /* REx optimizer.  Converts nodes into quicker variants "in place".
3353    Finds fixed substrings.  */
3354
3355 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3356    to the position after last scanned or to NULL. */
3357
3358 #define INIT_AND_WITHP \
3359     assert(!and_withp); \
3360     Newx(and_withp,1, regnode_ssc); \
3361     SAVEFREEPV(and_withp)
3362
3363 /* this is a chain of data about sub patterns we are processing that
3364    need to be handled separately/specially in study_chunk. Its so
3365    we can simulate recursion without losing state.  */
3366 struct scan_frame;
3367 typedef struct scan_frame {
3368     regnode *last;  /* last node to process in this frame */
3369     regnode *next;  /* next node to process when last is reached */
3370     struct scan_frame *prev; /*previous frame*/
3371     U32 prev_recursed_depth;
3372     I32 stop; /* what stopparen do we use */
3373 } scan_frame;
3374
3375
3376 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
3377
3378 STATIC SSize_t
3379 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3380                         SSize_t *minlenp, SSize_t *deltap,
3381                         regnode *last,
3382                         scan_data_t *data,
3383                         I32 stopparen,
3384                         U32 recursed_depth,
3385                         regnode_ssc *and_withp,
3386                         U32 flags, U32 depth)
3387                         /* scanp: Start here (read-write). */
3388                         /* deltap: Write maxlen-minlen here. */
3389                         /* last: Stop before this one. */
3390                         /* data: string data about the pattern */
3391                         /* stopparen: treat close N as END */
3392                         /* recursed: which subroutines have we recursed into */
3393                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3394 {
3395     dVAR;
3396     /* There must be at least this number of characters to match */
3397     SSize_t min = 0;
3398     I32 pars = 0, code;
3399     regnode *scan = *scanp, *next;
3400     SSize_t delta = 0;
3401     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3402     int is_inf_internal = 0;            /* The studied chunk is infinite */
3403     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3404     scan_data_t data_fake;
3405     SV *re_trie_maxbuff = NULL;
3406     regnode *first_non_open = scan;
3407     SSize_t stopmin = SSize_t_MAX;
3408     scan_frame *frame = NULL;
3409     GET_RE_DEBUG_FLAGS_DECL;
3410
3411     PERL_ARGS_ASSERT_STUDY_CHUNK;
3412
3413 #ifdef DEBUGGING
3414     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3415 #endif
3416     if ( depth == 0 ) {
3417         while (first_non_open && OP(first_non_open) == OPEN)
3418             first_non_open=regnext(first_non_open);
3419     }
3420
3421
3422   fake_study_recurse:
3423     while ( scan && OP(scan) != END && scan < last ){
3424         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3425                                    node length to get a real minimum (because
3426                                    the folded version may be shorter) */
3427         bool has_exactf_sharp_s = FALSE;
3428         /* Peephole optimizer: */
3429         DEBUG_OPTIMISE_MORE_r(
3430         {
3431             PerlIO_printf(Perl_debug_log,"%*sstudy_chunk stopparen=%ld depth=%lu recursed_depth=%lu ",
3432                           ((int) depth*2), "", (long)stopparen,
3433                           (unsigned long)depth, (unsigned long)recursed_depth);
3434             if (recursed_depth) {
3435                 U32 i;
3436                 U32 j;
3437                 for ( j = 0 ; j < recursed_depth ; j++ ) {
3438                     PerlIO_printf(Perl_debug_log,"[");
3439                     for ( i = 0 ; i < (U32)RExC_npar ; i++ )
3440                         PerlIO_printf(Perl_debug_log,"%d",
3441                             PAREN_TEST(RExC_study_chunk_recursed +
3442                                        (j * RExC_study_chunk_recursed_bytes), i)
3443                             ? 1 : 0
3444                         );
3445                     PerlIO_printf(Perl_debug_log,"]");
3446                 }
3447             }
3448             PerlIO_printf(Perl_debug_log,"\n");
3449         }
3450         );
3451         DEBUG_STUDYDATA("Peep:", data, depth);
3452         DEBUG_PEEP("Peep", scan, depth);
3453
3454
3455         /* Its not clear to khw or hv why this is done here, and not in the
3456          * clauses that deal with EXACT nodes.  khw's guess is that it's
3457          * because of a previous design */
3458         JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3459
3460         /* Follow the next-chain of the current node and optimize
3461            away all the NOTHINGs from it.  */
3462         if (OP(scan) != CURLYX) {
3463             const int max = (reg_off_by_arg[OP(scan)]
3464                        ? I32_MAX
3465                        /* I32 may be smaller than U16 on CRAYs! */
3466                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3467             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3468             int noff;
3469             regnode *n = scan;
3470
3471             /* Skip NOTHING and LONGJMP. */
3472             while ((n = regnext(n))
3473                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3474                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3475                    && off + noff < max)
3476                 off += noff;
3477             if (reg_off_by_arg[OP(scan)])
3478                 ARG(scan) = off;
3479             else
3480                 NEXT_OFF(scan) = off;
3481         }
3482
3483
3484
3485         /* The principal pseudo-switch.  Cannot be a switch, since we
3486            look into several different things.  */
3487         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3488                    || OP(scan) == IFTHEN) {
3489             next = regnext(scan);
3490             code = OP(scan);
3491             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3492
3493             if (OP(next) == code || code == IFTHEN) {
3494                 /* NOTE - There is similar code to this block below for
3495                  * handling TRIE nodes on a re-study.  If you change stuff here
3496                  * check there too. */
3497                 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3498                 regnode_ssc accum;
3499                 regnode * const startbranch=scan;
3500
3501                 if (flags & SCF_DO_SUBSTR)
3502                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3503                 if (flags & SCF_DO_STCLASS)
3504                     ssc_init_zero(pRExC_state, &accum);
3505
3506                 while (OP(scan) == code) {
3507                     SSize_t deltanext, minnext, fake;
3508                     I32 f = 0;
3509                     regnode_ssc this_class;
3510
3511                     num++;
3512                     data_fake.flags = 0;
3513                     if (data) {
3514                         data_fake.whilem_c = data->whilem_c;
3515                         data_fake.last_closep = data->last_closep;
3516                     }
3517                     else
3518                         data_fake.last_closep = &fake;
3519
3520                     data_fake.pos_delta = delta;
3521                     next = regnext(scan);
3522                     scan = NEXTOPER(scan);
3523                     if (code != BRANCH)
3524                         scan = NEXTOPER(scan);
3525                     if (flags & SCF_DO_STCLASS) {
3526                         ssc_init(pRExC_state, &this_class);
3527                         data_fake.start_class = &this_class;
3528                         f = SCF_DO_STCLASS_AND;
3529                     }
3530                     if (flags & SCF_WHILEM_VISITED_POS)
3531                         f |= SCF_WHILEM_VISITED_POS;
3532
3533                     /* we suppose the run is continuous, last=next...*/
3534                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3535                                           next, &data_fake,
3536                                           stopparen, recursed_depth, NULL, f,depth+1);
3537                     if (min1 > minnext)
3538                         min1 = minnext;
3539                     if (deltanext == SSize_t_MAX) {
3540                         is_inf = is_inf_internal = 1;
3541                         max1 = SSize_t_MAX;
3542                     } else if (max1 < minnext + deltanext)
3543                         max1 = minnext + deltanext;
3544                     scan = next;
3545                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3546                         pars++;
3547                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3548                         if ( stopmin > minnext)
3549                             stopmin = min + min1;
3550                         flags &= ~SCF_DO_SUBSTR;
3551                         if (data)
3552                             data->flags |= SCF_SEEN_ACCEPT;
3553                     }
3554                     if (data) {
3555                         if (data_fake.flags & SF_HAS_EVAL)
3556                             data->flags |= SF_HAS_EVAL;
3557                         data->whilem_c = data_fake.whilem_c;
3558                     }
3559                     if (flags & SCF_DO_STCLASS)
3560                         ssc_or(pRExC_state, &accum, &this_class);
3561                 }
3562                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3563                     min1 = 0;
3564                 if (flags & SCF_DO_SUBSTR) {
3565                     data->pos_min += min1;
3566                     if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3567                         data->pos_delta = SSize_t_MAX;
3568                     else
3569                         data->pos_delta += max1 - min1;
3570                     if (max1 != min1 || is_inf)
3571                         data->longest = &(data->longest_float);
3572                 }
3573                 min += min1;
3574                 if (delta == SSize_t_MAX
3575                  || SSize_t_MAX - delta - (max1 - min1) < 0)
3576                     delta = SSize_t_MAX;
3577                 else
3578                     delta += max1 - min1;
3579                 if (flags & SCF_DO_STCLASS_OR) {
3580                     ssc_or(pRExC_state, data->start_class, &accum);
3581                     if (min1) {
3582                         ssc_and(pRExC_state, data->start_class, and_withp);
3583                         flags &= ~SCF_DO_STCLASS;
3584                     }
3585                 }
3586                 else if (flags & SCF_DO_STCLASS_AND) {
3587                     if (min1) {
3588                         ssc_and(pRExC_state, data->start_class, &accum);
3589                         flags &= ~SCF_DO_STCLASS;
3590                     }
3591                     else {
3592                         /* Switch to OR mode: cache the old value of
3593                          * data->start_class */
3594                         INIT_AND_WITHP;
3595                         StructCopy(data->start_class, and_withp, regnode_ssc);
3596                         flags &= ~SCF_DO_STCLASS_AND;
3597                         StructCopy(&accum, data->start_class, regnode_ssc);
3598                         flags |= SCF_DO_STCLASS_OR;
3599                     }
3600                 }
3601
3602                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3603                 /* demq.
3604
3605                    Assuming this was/is a branch we are dealing with: 'scan'
3606                    now points at the item that follows the branch sequence,
3607                    whatever it is. We now start at the beginning of the
3608                    sequence and look for subsequences of
3609
3610                    BRANCH->EXACT=>x1
3611                    BRANCH->EXACT=>x2
3612                    tail
3613
3614                    which would be constructed from a pattern like
3615                    /A|LIST|OF|WORDS/
3616
3617                    If we can find such a subsequence we need to turn the first
3618                    element into a trie and then add the subsequent branch exact
3619                    strings to the trie.
3620
3621                    We have two cases
3622
3623                      1. patterns where the whole set of branches can be
3624                         converted.
3625
3626                      2. patterns where only a subset can be converted.
3627
3628                    In case 1 we can replace the whole set with a single regop
3629                    for the trie. In case 2 we need to keep the start and end
3630                    branches so
3631
3632                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3633                      becomes BRANCH TRIE; BRANCH X;
3634
3635                   There is an additional case, that being where there is a
3636                   common prefix, which gets split out into an EXACT like node
3637                   preceding the TRIE node.
3638
3639                   If x(1..n)==tail then we can do a simple trie, if not we make
3640                   a "jump" trie, such that when we match the appropriate word
3641                   we "jump" to the appropriate tail node. Essentially we turn
3642                   a nested if into a case structure of sorts.
3643
3644                 */
3645
3646                     int made=0;
3647                     if (!re_trie_maxbuff) {
3648                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3649                         if (!SvIOK(re_trie_maxbuff))
3650                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3651                     }
3652                     if ( SvIV(re_trie_maxbuff)>=0  ) {
3653                         regnode *cur;
3654                         regnode *first = (regnode *)NULL;
3655                         regnode *last = (regnode *)NULL;
3656                         regnode *tail = scan;
3657                         U8 trietype = 0;
3658                         U32 count=0;
3659
3660 #ifdef DEBUGGING
3661                         SV * const mysv = sv_newmortal();       /* for dumping */
3662 #endif
3663                         /* var tail is used because there may be a TAIL
3664                            regop in the way. Ie, the exacts will point to the
3665                            thing following the TAIL, but the last branch will
3666                            point at the TAIL. So we advance tail. If we
3667                            have nested (?:) we may have to move through several
3668                            tails.
3669                          */
3670
3671                         while ( OP( tail ) == TAIL ) {
3672                             /* this is the TAIL generated by (?:) */
3673                             tail = regnext( tail );
3674                         }
3675
3676
3677                         DEBUG_TRIE_COMPILE_r({
3678                             regprop(RExC_rx, mysv, tail );
3679                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3680                                 (int)depth * 2 + 2, "",
3681                                 "Looking for TRIE'able sequences. Tail node is: ",
3682                                 SvPV_nolen_const( mysv )
3683                             );
3684                         });
3685
3686                         /*
3687
3688                             Step through the branches
3689                                 cur represents each branch,
3690                                 noper is the first thing to be matched as part
3691                                       of that branch
3692                                 noper_next is the regnext() of that node.
3693
3694                             We normally handle a case like this
3695                             /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
3696                             support building with NOJUMPTRIE, which restricts
3697                             the trie logic to structures like /FOO|BAR/.
3698
3699                             If noper is a trieable nodetype then the branch is
3700                             a possible optimization target. If we are building
3701                             under NOJUMPTRIE then we require that noper_next is
3702                             the same as scan (our current position in the regex
3703                             program).
3704
3705                             Once we have two or more consecutive such branches
3706                             we can create a trie of the EXACT's contents and
3707                             stitch it in place into the program.
3708
3709                             If the sequence represents all of the branches in
3710                             the alternation we replace the entire thing with a
3711                             single TRIE node.
3712
3713                             Otherwise when it is a subsequence we need to
3714                             stitch it in place and replace only the relevant
3715                             branches. This means the first branch has to remain
3716                             as it is used by the alternation logic, and its
3717                             next pointer, and needs to be repointed at the item
3718                             on the branch chain following the last branch we
3719                             have optimized away.
3720
3721                             This could be either a BRANCH, in which case the
3722                             subsequence is internal, or it could be the item
3723                             following the branch sequence in which case the
3724                             subsequence is at the end (which does not
3725                             necessarily mean the first node is the start of the
3726                             alternation).
3727
3728                             TRIE_TYPE(X) is a define which maps the optype to a
3729                             trietype.
3730
3731                                 optype          |  trietype
3732                                 ----------------+-----------
3733                                 NOTHING         | NOTHING
3734                                 EXACT           | EXACT
3735                                 EXACTFU         | EXACTFU
3736                                 EXACTFU_SS      | EXACTFU
3737                                 EXACTFA         | EXACTFA
3738
3739
3740                         */
3741 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3742                        ( EXACT == (X) )   ? EXACT :        \
3743                        ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU :        \
3744                        ( EXACTFA == (X) ) ? EXACTFA :        \
3745                        0 )
3746
3747                         /* dont use tail as the end marker for this traverse */
3748                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3749                             regnode * const noper = NEXTOPER( cur );
3750                             U8 noper_type = OP( noper );
3751                             U8 noper_trietype = TRIE_TYPE( noper_type );
3752 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3753                             regnode * const noper_next = regnext( noper );
3754                             U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3755                             U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3756 #endif
3757
3758                             DEBUG_TRIE_COMPILE_r({
3759                                 regprop(RExC_rx, mysv, cur);
3760                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3761                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3762
3763                                 regprop(RExC_rx, mysv, noper);
3764                                 PerlIO_printf( Perl_debug_log, " -> %s",
3765                                     SvPV_nolen_const(mysv));
3766
3767                                 if ( noper_next ) {
3768                                   regprop(RExC_rx, mysv, noper_next );
3769                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3770                                     SvPV_nolen_const(mysv));
3771                                 }
3772                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3773                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3774                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3775                                 );
3776                             });
3777
3778                             /* Is noper a trieable nodetype that can be merged
3779                              * with the current trie (if there is one)? */
3780                             if ( noper_trietype
3781                                   &&
3782                                   (
3783                                         ( noper_trietype == NOTHING)
3784                                         || ( trietype == NOTHING )
3785                                         || ( trietype == noper_trietype )
3786                                   )
3787 #ifdef NOJUMPTRIE
3788                                   && noper_next == tail
3789 #endif
3790                                   && count < U16_MAX)
3791                             {
3792                                 /* Handle mergable triable node Either we are
3793                                  * the first node in a new trieable sequence,
3794                                  * in which case we do some bookkeeping,
3795                                  * otherwise we update the end pointer. */
3796                                 if ( !first ) {
3797                                     first = cur;
3798                                     if ( noper_trietype == NOTHING ) {
3799 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3800                                         regnode * const noper_next = regnext( noper );
3801                                         U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3802                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3803 #endif
3804
3805                                         if ( noper_next_trietype ) {
3806                                             trietype = noper_next_trietype;
3807                                         } else if (noper_next_type)  {
3808                                             /* a NOTHING regop is 1 regop wide.
3809                                              * We need at least two for a trie
3810                                              * so we can't merge this in */
3811                                             first = NULL;
3812                                         }
3813                                     } else {
3814                                         trietype = noper_trietype;
3815                                     }
3816                                 } else {
3817                                     if ( trietype == NOTHING )
3818                                         trietype = noper_trietype;
3819                                     last = cur;
3820                                 }
3821                                 if (first)
3822                                     count++;
3823                             } /* end handle mergable triable node */
3824                             else {
3825                                 /* handle unmergable node -
3826                                  * noper may either be a triable node which can
3827                                  * not be tried together with the current trie,
3828                                  * or a non triable node */
3829                                 if ( last ) {
3830                                     /* If last is set and trietype is not
3831                                      * NOTHING then we have found at least two
3832                                      * triable branch sequences in a row of a
3833                                      * similar trietype so we can turn them
3834                                      * into a trie. If/when we allow NOTHING to
3835                                      * start a trie sequence this condition
3836                                      * will be required, and it isn't expensive
3837                                      * so we leave it in for now. */
3838                                     if ( trietype && trietype != NOTHING )
3839                                         make_trie( pRExC_state,
3840                                                 startbranch, first, cur, tail, count,
3841                                                 trietype, depth+1 );
3842                                     last = NULL; /* note: we clear/update
3843                                                     first, trietype etc below,
3844                                                     so we dont do it here */
3845                                 }
3846                                 if ( noper_trietype
3847 #ifdef NOJUMPTRIE
3848                                      && noper_next == tail
3849 #endif
3850                                 ){
3851                                     /* noper is triable, so we can start a new
3852                                      * trie sequence */
3853                                     count = 1;
3854                                     first = cur;
3855                                     trietype = noper_trietype;
3856                                 } else if (first) {
3857                                     /* if we already saw a first but the
3858                                      * current node is not triable then we have
3859                                      * to reset the first information. */
3860                                     count = 0;
3861                                     first = NULL;
3862                                     trietype = 0;
3863                                 }
3864                             } /* end handle unmergable node */
3865                         } /* loop over branches */
3866                         DEBUG_TRIE_COMPILE_r({
3867                             regprop(RExC_rx, mysv, cur);
3868                             PerlIO_printf( Perl_debug_log,
3869                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3870                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3871
3872                         });
3873                         if ( last && trietype ) {
3874                             if ( trietype != NOTHING ) {
3875                                 /* the last branch of the sequence was part of
3876                                  * a trie, so we have to construct it here
3877                                  * outside of the loop */
3878                                 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3879 #ifdef TRIE_STUDY_OPT
3880                                 if ( ((made == MADE_EXACT_TRIE &&
3881                                      startbranch == first)
3882                                      || ( first_non_open == first )) &&
3883                                      depth==0 ) {
3884                                     flags |= SCF_TRIE_RESTUDY;
3885                                     if ( startbranch == first
3886                                          && scan == tail )
3887                                     {
3888                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3889                                     }
3890                                 }
3891 #endif
3892                             } else {
3893                                 /* at this point we know whatever we have is a
3894                                  * NOTHING sequence/branch AND if 'startbranch'
3895                                  * is 'first' then we can turn the whole thing
3896                                  * into a NOTHING
3897                                  */
3898                                 if ( startbranch == first ) {
3899                                     regnode *opt;
3900                                     /* the entire thing is a NOTHING sequence,
3901                                      * something like this: (?:|) So we can
3902                                      * turn it into a plain NOTHING op. */
3903                                     DEBUG_TRIE_COMPILE_r({
3904                                         regprop(RExC_rx, mysv, cur);
3905                                         PerlIO_printf( Perl_debug_log,
3906                                           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3907                                           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3908
3909                                     });
3910                                     OP(startbranch)= NOTHING;
3911                                     NEXT_OFF(startbranch)= tail - startbranch;
3912                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
3913                                         OP(opt)= OPTIMIZED;
3914                                 }
3915                             }
3916                         } /* end if ( last) */
3917                     } /* TRIE_MAXBUF is non zero */
3918
3919                 } /* do trie */
3920
3921             }
3922             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
3923                 scan = NEXTOPER(NEXTOPER(scan));
3924             } else                      /* single branch is optimized. */
3925                 scan = NEXTOPER(scan);
3926             continue;
3927         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3928             scan_frame *newframe = NULL;
3929             I32 paren;
3930             regnode *start;
3931             regnode *end;
3932             U32 my_recursed_depth= recursed_depth;
3933
3934             if (OP(scan) != SUSPEND) {
3935                 /* set the pointer */
3936                 if (OP(scan) == GOSUB) {
3937                     paren = ARG(scan);
3938                     RExC_recurse[ARG2L(scan)] = scan;
3939                     start = RExC_open_parens[paren-1];
3940                     end   = RExC_close_parens[paren-1];
3941                 } else {
3942                     paren = 0;
3943                     start = RExC_rxi->program + 1;
3944                     end   = RExC_opend;
3945                 }
3946                 if (!recursed_depth
3947                     ||
3948                     !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
3949                 ) {
3950                     if (!recursed_depth) {
3951                         Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
3952                     } else {
3953                         Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
3954                              RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
3955                              RExC_study_chunk_recursed_bytes, U8);
3956                     }
3957                     /* we havent recursed into this paren yet, so recurse into it */
3958                     DEBUG_STUDYDATA("set:", data,depth);
3959                     PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
3960                     my_recursed_depth= recursed_depth + 1;
3961                     Newx(newframe,1,scan_frame);
3962                 } else {
3963                     DEBUG_STUDYDATA("inf:", data,depth);
3964                     /* some form of infinite recursion, assume infinite length */
3965                     if (flags & SCF_DO_SUBSTR) {
3966                         SCAN_COMMIT(pRExC_state,data,minlenp);
3967                         data->longest = &(data->longest_float);
3968                     }
3969                     is_inf = is_inf_internal = 1;
3970                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3971                         ssc_anything(data->start_class);
3972                     flags &= ~SCF_DO_STCLASS;
3973                 }
3974             } else {
3975                 Newx(newframe,1,scan_frame);
3976                 paren = stopparen;
3977                 start = scan+2;
3978                 end = regnext(scan);
3979             }
3980             if (newframe) {
3981                 assert(start);
3982                 assert(end);
3983                 SAVEFREEPV(newframe);
3984                 newframe->next = regnext(scan);
3985                 newframe->last = last;
3986                 newframe->stop = stopparen;
3987                 newframe->prev = frame;
3988                 newframe->prev_recursed_depth = recursed_depth;
3989
3990                 DEBUG_STUDYDATA("frame-new:",data,depth);
3991                 DEBUG_PEEP("fnew", scan, depth);
3992
3993                 frame = newframe;
3994                 scan =  start;
3995                 stopparen = paren;
3996                 last = end;
3997                 depth = depth + 1;
3998                 recursed_depth= my_recursed_depth;
3999
4000                 continue;
4001             }
4002         }
4003         else if (OP(scan) == EXACT) {
4004             SSize_t l = STR_LEN(scan);
4005             UV uc;
4006             if (UTF) {
4007                 const U8 * const s = (U8*)STRING(scan);
4008                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4009                 l = utf8_length(s, s + l);
4010             } else {
4011                 uc = *((U8*)STRING(scan));
4012             }
4013             min += l;
4014             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4015                 /* The code below prefers earlier match for fixed
4016                    offset, later match for variable offset.  */
4017                 if (data->last_end == -1) { /* Update the start info. */
4018                     data->last_start_min = data->pos_min;
4019                     data->last_start_max = is_inf
4020                         ? SSize_t_MAX : data->pos_min + data->pos_delta;
4021                 }
4022                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4023                 if (UTF)
4024                     SvUTF8_on(data->last_found);
4025                 {
4026                     SV * const sv = data->last_found;
4027                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4028                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4029                     if (mg && mg->mg_len >= 0)
4030                         mg->mg_len += utf8_length((U8*)STRING(scan),
4031                                                   (U8*)STRING(scan)+STR_LEN(scan));
4032                 }
4033                 data->last_end = data->pos_min + l;
4034                 data->pos_min += l; /* As in the first entry. */
4035                 data->flags &= ~SF_BEFORE_EOL;
4036             }
4037
4038             /* ANDing the code point leaves at most it, and not in locale, and
4039              * can't match null string */
4040             if (flags & SCF_DO_STCLASS_AND) {
4041                 ssc_cp_and(data->start_class, uc);
4042                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4043                 ssc_clear_locale(data->start_class);
4044             }
4045             else if (flags & SCF_DO_STCLASS_OR) {
4046                 ssc_add_cp(data->start_class, uc);
4047                 ssc_and(pRExC_state, data->start_class, and_withp);
4048
4049                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4050                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4051             }
4052             flags &= ~SCF_DO_STCLASS;
4053         }
4054         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
4055             SSize_t l = STR_LEN(scan);
4056             UV uc = *((U8*)STRING(scan));
4057             SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
4058                                                      separate code points */
4059
4060             /* Search for fixed substrings supports EXACT only. */
4061             if (flags & SCF_DO_SUBSTR) {
4062                 assert(data);
4063                 SCAN_COMMIT(pRExC_state, data, minlenp);
4064             }
4065             if (UTF) {
4066                 const U8 * const s = (U8 *)STRING(scan);
4067                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4068                 l = utf8_length(s, s + l);
4069             }
4070             if (has_exactf_sharp_s) {
4071                 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
4072             }
4073             min += l - min_subtract;
4074             assert (min >= 0);
4075             delta += min_subtract;
4076             if (flags & SCF_DO_SUBSTR) {
4077                 data->pos_min += l - min_subtract;
4078                 if (data->pos_min < 0) {
4079                     data->pos_min = 0;
4080                 }
4081                 data->pos_delta += min_subtract;
4082                 if (min_subtract) {
4083                     data->longest = &(data->longest_float);
4084                 }
4085             }
4086             if (OP(scan) == EXACTFL) {
4087                 if (flags & SCF_DO_STCLASS_AND) {
4088                     ssc_flags_and(data->start_class,
4089                                                 ANYOF_LOCALE|ANYOF_LOC_FOLD);
4090                 }
4091                 else if (flags & SCF_DO_STCLASS_OR) {
4092                     ANYOF_FLAGS(data->start_class)
4093                                                 |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
4094                 }
4095
4096                 /* We don't know what the folds are; it could be anything. XXX
4097                  * Actually, we only support UTF-8 encoding for code points
4098                  * above Latin1, so we could know what those folds are. */
4099                 EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
4100                                                        0,
4101                                                        UV_MAX);
4102             }
4103             else {  /* Non-locale EXACTFish */
4104                 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4105                 if (flags & SCF_DO_STCLASS_AND) {
4106                     ssc_clear_locale(data->start_class);
4107                 }
4108                 if (uc < 256) { /* We know what the Latin1 folds are ... */
4109                     if (IS_IN_SOME_FOLD_L1(uc)) {   /* For instance, we
4110                                                        know if anything folds
4111                                                        with this */
4112                         EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
4113                                                            PL_fold_latin1[uc]);
4114                         if (OP(scan) != EXACTFA) { /* The folds below aren't
4115                                                       legal under /iaa */
4116                             if (isARG2_lower_or_UPPER_ARG1('s', uc)) {
4117                                 EXACTF_invlist
4118                                     = add_cp_to_invlist(EXACTF_invlist,
4119                                                 LATIN_SMALL_LETTER_SHARP_S);
4120                             }
4121                             else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
4122                                 EXACTF_invlist
4123                                     = add_cp_to_invlist(EXACTF_invlist, 's');
4124                                 EXACTF_invlist
4125                                     = add_cp_to_invlist(EXACTF_invlist, 'S');
4126                             }
4127                         }
4128
4129                         /* We also know if there are above-Latin1 code points
4130                          * that fold to this (none legal for ASCII and /iaa) */
4131                         if ((! isASCII(uc) || OP(scan) != EXACTFA)
4132                             && HAS_NONLATIN1_FOLD_CLOSURE(uc))
4133                         {
4134                             /* XXX We could know exactly what does fold to this
4135                              * if the reverse folds are loaded, as currently in
4136                              * S_regclass() */
4137                             _invlist_union(EXACTF_invlist,
4138                                            PL_AboveLatin1,
4139                                            &EXACTF_invlist);
4140                         }
4141                     }
4142                 }
4143                 else {  /* Non-locale, above Latin1.  XXX We don't currently
4144                            know what participates in folds with this, so have
4145                            to assume anything could */
4146
4147                     /* XXX We could know exactly what does fold to this if the
4148                      * reverse folds are loaded, as currently in S_regclass().
4149                      * But we do know that under /iaa nothing in the ASCII
4150                      * range can participate */
4151                     if (OP(scan) == EXACTFA) {
4152                         _invlist_union_complement_2nd(EXACTF_invlist,
4153                                                       PL_Posix_ptrs[_CC_ASCII],
4154                                                       &EXACTF_invlist);
4155                     }
4156                     else {
4157                         EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
4158                                                                0, UV_MAX);
4159                     }
4160                 }
4161             }
4162             if (flags & SCF_DO_STCLASS_AND) {
4163                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4164                 ANYOF_POSIXL_ZERO(data->start_class);
4165                 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4166             }
4167             else if (flags & SCF_DO_STCLASS_OR) {
4168                 ssc_union(data->start_class, EXACTF_invlist, FALSE);
4169                 ssc_and(pRExC_state, data->start_class, and_withp);
4170
4171                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4172                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4173             }
4174             flags &= ~SCF_DO_STCLASS;
4175             SvREFCNT_dec(EXACTF_invlist);
4176         }
4177         else if (REGNODE_VARIES(OP(scan))) {
4178             SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4179             I32 fl = 0, f = flags;
4180             regnode * const oscan = scan;
4181             regnode_ssc this_class;
4182             regnode_ssc *oclass = NULL;
4183             I32 next_is_eval = 0;
4184
4185             switch (PL_regkind[OP(scan)]) {
4186             case WHILEM:                /* End of (?:...)* . */
4187                 scan = NEXTOPER(scan);
4188                 goto finish;
4189             case PLUS:
4190                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4191                     next = NEXTOPER(scan);
4192                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
4193                         mincount = 1;
4194                         maxcount = REG_INFTY;
4195                         next = regnext(scan);
4196                         scan = NEXTOPER(scan);
4197                         goto do_curly;
4198                     }
4199                 }
4200                 if (flags & SCF_DO_SUBSTR)
4201                     data->pos_min++;
4202                 min++;
4203                 /* Fall through. */
4204             case STAR:
4205                 if (flags & SCF_DO_STCLASS) {
4206                     mincount = 0;
4207                     maxcount = REG_INFTY;
4208                     next = regnext(scan);
4209                     scan = NEXTOPER(scan);
4210                     goto do_curly;
4211                 }
4212                 is_inf = is_inf_internal = 1;
4213                 scan = regnext(scan);
4214                 if (flags & SCF_DO_SUBSTR) {
4215                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
4216                     data->longest = &(data->longest_float);
4217                 }
4218                 goto optimize_curly_tail;
4219             case CURLY:
4220                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4221                     && (scan->flags == stopparen))
4222                 {
4223                     mincount = 1;
4224                     maxcount = 1;
4225                 } else {
4226                     mincount = ARG1(scan);
4227                     maxcount = ARG2(scan);
4228                 }
4229                 next = regnext(scan);
4230                 if (OP(scan) == CURLYX) {
4231                     I32 lp = (data ? *(data->last_closep) : 0);
4232                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4233                 }
4234                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4235                 next_is_eval = (OP(scan) == EVAL);
4236               do_curly:
4237                 if (flags & SCF_DO_SUBSTR) {
4238                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
4239                     pos_before = data->pos_min;
4240                 }
4241                 if (data) {
4242                     fl = data->flags;
4243                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4244                     if (is_inf)
4245                         data->flags |= SF_IS_INF;
4246                 }
4247                 if (flags & SCF_DO_STCLASS) {
4248                     ssc_init(pRExC_state, &this_class);
4249                     oclass = data->start_class;
4250                     data->start_class = &this_class;
4251                     f |= SCF_DO_STCLASS_AND;
4252                     f &= ~SCF_DO_STCLASS_OR;
4253                 }
4254                 /* Exclude from super-linear cache processing any {n,m}
4255                    regops for which the combination of input pos and regex
4256                    pos is not enough information to determine if a match
4257                    will be possible.
4258
4259                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
4260                    regex pos at the \s*, the prospects for a match depend not
4261                    only on the input position but also on how many (bar\s*)
4262                    repeats into the {4,8} we are. */
4263                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4264                     f &= ~SCF_WHILEM_VISITED_POS;
4265
4266                 /* This will finish on WHILEM, setting scan, or on NULL: */
4267                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4268                                       last, data, stopparen, recursed_depth, NULL,
4269                                       (mincount == 0
4270                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
4271
4272                 if (flags & SCF_DO_STCLASS)
4273                     data->start_class = oclass;
4274                 if (mincount == 0 || minnext == 0) {
4275                     if (flags & SCF_DO_STCLASS_OR) {
4276                         ssc_or(pRExC_state, data->start_class, &this_class);
4277                     }
4278                     else if (flags & SCF_DO_STCLASS_AND) {
4279                         /* Switch to OR mode: cache the old value of
4280                          * data->start_class */
4281                         INIT_AND_WITHP;
4282                         StructCopy(data->start_class, and_withp, regnode_ssc);
4283                         flags &= ~SCF_DO_STCLASS_AND;
4284                         StructCopy(&this_class, data->start_class, regnode_ssc);
4285                         flags |= SCF_DO_STCLASS_OR;
4286                         ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
4287                     }
4288                 } else {                /* Non-zero len */
4289                     if (flags & SCF_DO_STCLASS_OR) {
4290                         ssc_or(pRExC_state, data->start_class, &this_class);
4291                         ssc_and(pRExC_state, data->start_class, and_withp);
4292                     }
4293                     else if (flags & SCF_DO_STCLASS_AND)
4294                         ssc_and(pRExC_state, data->start_class, &this_class);
4295                     flags &= ~SCF_DO_STCLASS;
4296                 }
4297                 if (!scan)              /* It was not CURLYX, but CURLY. */
4298                     scan = next;
4299                 if (!(flags & SCF_TRIE_DOING_RESTUDY)
4300                     /* ? quantifier ok, except for (?{ ... }) */
4301                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
4302                     && (minnext == 0) && (deltanext == 0)
4303                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4304                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
4305                 {
4306                     /* Fatal warnings may leak the regexp without this: */
4307                     SAVEFREESV(RExC_rx_sv);
4308                     ckWARNreg(RExC_parse,
4309                               "Quantifier unexpected on zero-length expression");
4310                     (void)ReREFCNT_inc(RExC_rx_sv);
4311                 }
4312
4313                 min += minnext * mincount;
4314                 is_inf_internal |= deltanext == SSize_t_MAX
4315                                      || (maxcount == REG_INFTY && minnext + deltanext > 0);
4316                 is_inf |= is_inf_internal;
4317                 if (is_inf)
4318                     delta = SSize_t_MAX;
4319                 else
4320                     delta += (minnext + deltanext) * maxcount - minnext * mincount;
4321
4322                 /* Try powerful optimization CURLYX => CURLYN. */
4323                 if (  OP(oscan) == CURLYX && data
4324                       && data->flags & SF_IN_PAR
4325                       && !(data->flags & SF_HAS_EVAL)
4326                       && !deltanext && minnext == 1 ) {
4327                     /* Try to optimize to CURLYN.  */
4328                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4329                     regnode * const nxt1 = nxt;
4330 #ifdef DEBUGGING
4331                     regnode *nxt2;
4332 #endif
4333
4334                     /* Skip open. */
4335                     nxt = regnext(nxt);
4336                     if (!REGNODE_SIMPLE(OP(nxt))
4337                         && !(PL_regkind[OP(nxt)] == EXACT
4338                              && STR_LEN(nxt) == 1))
4339                         goto nogo;
4340 #ifdef DEBUGGING
4341                     nxt2 = nxt;
4342 #endif
4343                     nxt = regnext(nxt);
4344                     if (OP(nxt) != CLOSE)
4345                         goto nogo;
4346                     if (RExC_open_parens) {
4347                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4348                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4349                     }
4350                     /* Now we know that nxt2 is the only contents: */
4351                     oscan->flags = (U8)ARG(nxt);
4352                     OP(oscan) = CURLYN;
4353                     OP(nxt1) = NOTHING; /* was OPEN. */
4354
4355 #ifdef DEBUGGING
4356                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4357                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4358                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4359                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
4360                     OP(nxt + 1) = OPTIMIZED; /* was count. */
4361                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4362 #endif
4363                 }
4364               nogo:
4365
4366                 /* Try optimization CURLYX => CURLYM. */
4367                 if (  OP(oscan) == CURLYX && data
4368                       && !(data->flags & SF_HAS_PAR)
4369                       && !(data->flags & SF_HAS_EVAL)
4370                       && !deltanext     /* atom is fixed width */
4371                       && minnext != 0   /* CURLYM can't handle zero width */
4372                       && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
4373                 ) {
4374                     /* XXXX How to optimize if data == 0? */
4375                     /* Optimize to a simpler form.  */
4376                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4377                     regnode *nxt2;
4378
4379                     OP(oscan) = CURLYM;
4380                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4381                             && (OP(nxt2) != WHILEM))
4382                         nxt = nxt2;
4383                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
4384                     /* Need to optimize away parenths. */
4385                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4386                         /* Set the parenth number.  */
4387                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4388
4389                         oscan->flags = (U8)ARG(nxt);
4390                         if (RExC_open_parens) {
4391                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4392                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4393                         }
4394                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
4395                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
4396
4397 #ifdef DEBUGGING
4398                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4399                         OP(nxt + 1) = OPTIMIZED; /* was count. */
4400                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4401                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4402 #endif
4403 #if 0
4404                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
4405                             regnode *nnxt = regnext(nxt1);
4406                             if (nnxt == nxt) {
4407                                 if (reg_off_by_arg[OP(nxt1)])
4408                                     ARG_SET(nxt1, nxt2 - nxt1);
4409                                 else if (nxt2 - nxt1 < U16_MAX)
4410                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
4411                                 else
4412                                     OP(nxt) = NOTHING;  /* Cannot beautify */
4413                             }
4414                             nxt1 = nnxt;
4415                         }
4416 #endif
4417                         /* Optimize again: */
4418                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4419                                     NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4420                     }
4421                     else
4422                         oscan->flags = 0;
4423                 }
4424                 else if ((OP(oscan) == CURLYX)
4425                          && (flags & SCF_WHILEM_VISITED_POS)
4426                          /* See the comment on a similar expression above.
4427                             However, this time it's not a subexpression
4428                             we care about, but the expression itself. */
4429                          && (maxcount == REG_INFTY)
4430                          && data && ++data->whilem_c < 16) {
4431                     /* This stays as CURLYX, we can put the count/of pair. */
4432                     /* Find WHILEM (as in regexec.c) */
4433                     regnode *nxt = oscan + NEXT_OFF(oscan);
4434
4435                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4436                         nxt += ARG(nxt);
4437                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
4438                         | (RExC_whilem_seen << 4)); /* On WHILEM */
4439                 }
4440                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4441                     pars++;
4442                 if (flags & SCF_DO_SUBSTR) {
4443                     SV *last_str = NULL;
4444                     int counted = mincount != 0;
4445
4446                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4447                         SSize_t b = pos_before >= data->last_start_min
4448                             ? pos_before : data->last_start_min;
4449                         STRLEN l;
4450                         const char * const s = SvPV_const(data->last_found, l);
4451                         SSize_t old = b - data->last_start_min;
4452
4453                         if (UTF)
4454                             old = utf8_hop((U8*)s, old) - (U8*)s;
4455                         l -= old;
4456                         /* Get the added string: */
4457                         last_str = newSVpvn_utf8(s  + old, l, UTF);
4458                         if (deltanext == 0 && pos_before == b) {
4459                             /* What was added is a constant string */
4460                             if (mincount > 1) {
4461                                 SvGROW(last_str, (mincount * l) + 1);
4462                                 repeatcpy(SvPVX(last_str) + l,
4463                                           SvPVX_const(last_str), l, mincount - 1);
4464                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4465                                 /* Add additional parts. */
4466                                 SvCUR_set(data->last_found,
4467                                           SvCUR(data->last_found) - l);
4468                                 sv_catsv(data->last_found, last_str);
4469                                 {
4470                                     SV * sv = data->last_found;
4471                                     MAGIC *mg =
4472                                         SvUTF8(sv) && SvMAGICAL(sv) ?
4473                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4474                                     if (mg && mg->mg_len >= 0)
4475                                         mg->mg_len += CHR_SVLEN(last_str) - l;
4476                                 }
4477                                 data->last_end += l * (mincount - 1);
4478                             }
4479                         } else {
4480                             /* start offset must point into the last copy */
4481                             data->last_start_min += minnext * (mincount - 1);
4482                             data->last_start_max += is_inf ? SSize_t_MAX
4483                                 : (maxcount - 1) * (minnext + data->pos_delta);
4484                         }
4485                     }
4486                     /* It is counted once already... */
4487                     data->pos_min += minnext * (mincount - counted);
4488 #if 0
4489 PerlIO_printf(Perl_debug_log, "counted=%"UVdf" deltanext=%"UVdf
4490                               " SSize_t_MAX=%"UVdf" minnext=%"UVdf
4491                               " maxcount=%"UVdf" mincount=%"UVdf"\n",
4492     (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4493     (UV)mincount);
4494 if (deltanext != SSize_t_MAX)
4495 PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
4496     (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4497           - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4498 #endif
4499                     if (deltanext == SSize_t_MAX ||
4500                         -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4501                         data->pos_delta = SSize_t_MAX;
4502                     else
4503                         data->pos_delta += - counted * deltanext +
4504                         (minnext + deltanext) * maxcount - minnext * mincount;
4505                     if (mincount != maxcount) {
4506                          /* Cannot extend fixed substrings found inside
4507                             the group.  */
4508                         SCAN_COMMIT(pRExC_state,data,minlenp);
4509                         if (mincount && last_str) {
4510                             SV * const sv = data->last_found;
4511                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4512                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4513
4514                             if (mg)
4515                                 mg->mg_len = -1;
4516                             sv_setsv(sv, last_str);
4517                             data->last_end = data->pos_min;
4518                             data->last_start_min =
4519                                 data->pos_min - CHR_SVLEN(last_str);
4520                             data->last_start_max = is_inf
4521                                 ? SSize_t_MAX
4522                                 : data->pos_min + data->pos_delta
4523                                 - CHR_SVLEN(last_str);
4524                         }
4525                         data->longest = &(data->longest_float);
4526                     }
4527                     SvREFCNT_dec(last_str);
4528                 }
4529                 if (data && (fl & SF_HAS_EVAL))
4530                     data->flags |= SF_HAS_EVAL;
4531               optimize_curly_tail:
4532                 if (OP(oscan) != CURLYX) {
4533                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4534                            && NEXT_OFF(next))
4535                         NEXT_OFF(oscan) += NEXT_OFF(next);
4536                 }
4537                 continue;
4538
4539             default:
4540 #ifdef DEBUGGING
4541                 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
4542                                                                     OP(scan));
4543 #endif
4544             case REF:
4545             case CLUMP:
4546                 if (flags & SCF_DO_SUBSTR) {
4547                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
4548                     data->longest = &(data->longest_float);
4549                 }
4550                 is_inf = is_inf_internal = 1;
4551                 if (flags & SCF_DO_STCLASS_OR) {
4552                     if (OP(scan) == CLUMP) {
4553                         /* Actually is any start char, but very few code points
4554                          * aren't start characters */
4555                         ssc_match_all_cp(data->start_class);
4556                     }
4557                     else {
4558                         ssc_anything(data->start_class);
4559                     }
4560                 }
4561                 flags &= ~SCF_DO_STCLASS;
4562                 break;
4563             }
4564         }
4565         else if (OP(scan) == LNBREAK) {
4566             if (flags & SCF_DO_STCLASS) {
4567                 if (flags & SCF_DO_STCLASS_AND) {
4568                     ssc_intersection(data->start_class,
4569                                     PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
4570                     ssc_clear_locale(data->start_class);
4571                     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4572                 }
4573                 else if (flags & SCF_DO_STCLASS_OR) {
4574                     ssc_union(data->start_class,
4575                               PL_XPosix_ptrs[_CC_VERTSPACE],
4576                               FALSE);
4577                     ssc_and(pRExC_state, data->start_class, and_withp);
4578
4579                     /* See commit msg for
4580                      * 749e076fceedeb708a624933726e7989f2302f6a */
4581                     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4582                 }
4583                 flags &= ~SCF_DO_STCLASS;
4584             }
4585             min++;
4586             delta++;    /* Because of the 2 char string cr-lf */
4587             if (flags & SCF_DO_SUBSTR) {
4588                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4589                 data->pos_min += 1;
4590                 data->pos_delta += 1;
4591                 data->longest = &(data->longest_float);
4592             }
4593         }
4594         else if (REGNODE_SIMPLE(OP(scan))) {
4595
4596             if (flags & SCF_DO_SUBSTR) {
4597                 SCAN_COMMIT(pRExC_state,data,minlenp);
4598                 data->pos_min++;
4599             }
4600             min++;
4601             if (flags & SCF_DO_STCLASS) {
4602                 bool invert = 0;
4603                 SV* my_invlist = sv_2mortal(_new_invlist(0));
4604                 U8 classnum;
4605                 U8 namedclass;
4606
4607                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4608                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4609
4610                 /* Some of the logic below assumes that switching
4611                    locale on will only add false positives. */
4612                 switch (OP(scan)) {
4613
4614                 default:
4615 #ifdef DEBUGGING
4616                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan));
4617 #endif
4618                 case CANY:
4619                 case SANY:
4620                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4621                         ssc_match_all_cp(data->start_class);
4622                     break;
4623
4624                 case REG_ANY:
4625                     {
4626                         SV* REG_ANY_invlist = _new_invlist(2);
4627                         REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
4628                                                             '\n');
4629                         if (flags & SCF_DO_STCLASS_OR) {
4630                             ssc_union(data->start_class,
4631                                       REG_ANY_invlist,
4632                                       TRUE /* TRUE => invert, hence all but \n
4633                                             */
4634                                       );
4635                         }
4636                         else if (flags & SCF_DO_STCLASS_AND) {
4637                             ssc_intersection(data->start_class,
4638                                              REG_ANY_invlist,
4639                                              TRUE  /* TRUE => invert */
4640                                              );
4641                             ssc_clear_locale(data->start_class);
4642                         }
4643                         SvREFCNT_dec_NN(REG_ANY_invlist);
4644                     }
4645                     break;
4646
4647                 case ANYOF:
4648                     if (flags & SCF_DO_STCLASS_AND)
4649                         ssc_and(pRExC_state, data->start_class,
4650                                 (regnode_ssc*) scan);
4651                     else
4652                         ssc_or(pRExC_state, data->start_class,
4653                                                           (regnode_ssc*)scan);
4654                     break;
4655
4656                 case NPOSIXL:
4657                     invert = 1;
4658                     /* FALL THROUGH */
4659
4660                 case POSIXL:
4661                     classnum = FLAGS(scan);
4662                     namedclass = classnum_to_namedclass(classnum) + invert;
4663                     if (flags & SCF_DO_STCLASS_AND) {
4664                         bool was_there = cBOOL(
4665                                           ANYOF_POSIXL_TEST(data->start_class,
4666                                                                  namedclass));
4667                         ANYOF_POSIXL_ZERO(data->start_class);
4668                         if (was_there) {    /* Do an AND */
4669                             ANYOF_POSIXL_SET(data->start_class, namedclass);
4670                         }
4671                         /* No individual code points can now match */
4672                         data->start_class->invlist
4673                                                 = sv_2mortal(_new_invlist(0));
4674                     }
4675                     else {
4676                         int complement = namedclass + ((invert) ? -1 : 1);
4677
4678                         assert(flags & SCF_DO_STCLASS_OR);
4679
4680                         /* If the complement of this class was already there,
4681                          * the result is that they match all code points,
4682                          * (\d + \D == everything).  Remove the classes from
4683                          * future consideration.  Locale is not relevant in
4684                          * this case */
4685                         if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
4686                             ssc_match_all_cp(data->start_class);
4687                             ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
4688                             ANYOF_POSIXL_CLEAR(data->start_class, complement);
4689                             if (! ANYOF_POSIXL_TEST_ANY_SET(data->start_class))
4690                             {
4691                                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_POSIXL;
4692                             }
4693                         }
4694                         else {  /* The usual case; just add this class to the
4695                                    existing set */
4696                             ANYOF_POSIXL_SET(data->start_class, namedclass);
4697                             ANYOF_FLAGS(data->start_class)
4698                                                 |= ANYOF_LOCALE|ANYOF_POSIXL;
4699                         }
4700                     }
4701                     break;
4702
4703                 case NPOSIXA:   /* For these, we always know the exact set of
4704                                    what's matched */
4705                     invert = 1;
4706                     /* FALL THROUGH */
4707                 case POSIXA:
4708                     classnum = FLAGS(scan);
4709                     my_invlist = PL_Posix_ptrs[classnum];
4710                     goto join_posix;
4711
4712                 case NPOSIXD:
4713                 case NPOSIXU:
4714                     invert = 1;
4715                     /* FALL THROUGH */
4716                 case POSIXD:
4717                 case POSIXU:
4718                     classnum = FLAGS(scan);
4719
4720                     /* If we know all the code points that match the class, use
4721                      * that; otherwise use the Latin1 code points, plus we have
4722                      * to assume that it could match anything above Latin1 */
4723                     if (PL_XPosix_ptrs[classnum]) {
4724                         my_invlist = invlist_clone(PL_XPosix_ptrs[classnum]);
4725                     }
4726                     else {
4727                         _invlist_union(PL_L1Posix_ptrs[classnum],
4728                                        PL_AboveLatin1, &my_invlist);
4729                     }
4730
4731                     /* NPOSIXD matches all upper Latin1 code points unless the
4732                      * target string being matched is UTF-8, which is
4733                      * unknowable until match time */
4734                     if (PL_regkind[OP(scan)] == NPOSIXD) {
4735                         _invlist_union_complement_2nd(my_invlist,
4736                                         PL_Posix_ptrs[_CC_ASCII], &my_invlist);
4737                     }
4738
4739                   join_posix:
4740
4741                     if (flags & SCF_DO_STCLASS_AND) {
4742                         ssc_intersection(data->start_class, my_invlist, invert);
4743                         ssc_clear_locale(data->start_class);
4744                     }
4745                     else {
4746                         assert(flags & SCF_DO_STCLASS_OR);
4747                         ssc_union(data->start_class, my_invlist, invert);
4748                     }
4749                 }
4750                 if (flags & SCF_DO_STCLASS_OR)
4751                     ssc_and(pRExC_state, data->start_class, and_withp);
4752                 flags &= ~SCF_DO_STCLASS;
4753             }
4754         }
4755         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4756             data->flags |= (OP(scan) == MEOL
4757                             ? SF_BEFORE_MEOL
4758                             : SF_BEFORE_SEOL);
4759             SCAN_COMMIT(pRExC_state, data, minlenp);
4760
4761         }
4762         else if (  PL_regkind[OP(scan)] == BRANCHJ
4763                  /* Lookbehind, or need to calculate parens/evals/stclass: */
4764                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
4765                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4766             if ( OP(scan) == UNLESSM &&
4767                  scan->flags == 0 &&
4768                  OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4769                  OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4770             ) {
4771                 regnode *opt;
4772                 regnode *upto= regnext(scan);
4773                 DEBUG_PARSE_r({
4774                     SV * const mysv_val=sv_newmortal();
4775                     DEBUG_STUDYDATA("OPFAIL",data,depth);
4776
4777                     /*DEBUG_PARSE_MSG("opfail");*/
4778                     regprop(RExC_rx, mysv_val, upto);
4779                     PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4780                                   SvPV_nolen_const(mysv_val),
4781                                   (IV)REG_NODE_NUM(upto),
4782                                   (IV)(upto - scan)
4783                     );
4784                 });
4785                 OP(scan) = OPFAIL;
4786                 NEXT_OFF(scan) = upto - scan;
4787                 for (opt= scan + 1; opt < upto ; opt++)
4788                     OP(opt) = OPTIMIZED;
4789                 scan= upto;
4790                 continue;
4791             }
4792             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4793                 || OP(scan) == UNLESSM )
4794             {
4795                 /* Negative Lookahead/lookbehind
4796                    In this case we can't do fixed string optimisation.
4797                 */
4798
4799                 SSize_t deltanext, minnext, fake = 0;
4800                 regnode *nscan;
4801                 regnode_ssc intrnl;
4802                 int f = 0;
4803
4804                 data_fake.flags = 0;
4805                 if (data) {
4806                     data_fake.whilem_c = data->whilem_c;
4807                     data_fake.last_closep = data->last_closep;
4808                 }
4809                 else
4810                     data_fake.last_closep = &fake;
4811                 data_fake.pos_delta = delta;
4812                 if ( flags & SCF_DO_STCLASS && !scan->flags
4813                      && OP(scan) == IFMATCH ) { /* Lookahead */
4814                     ssc_init(pRExC_state, &intrnl);
4815                     data_fake.start_class = &intrnl;
4816                     f |= SCF_DO_STCLASS_AND;
4817                 }
4818                 if (flags & SCF_WHILEM_VISITED_POS)
4819                     f |= SCF_WHILEM_VISITED_POS;
4820                 next = regnext(scan);
4821                 nscan = NEXTOPER(NEXTOPER(scan));
4822                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4823                     last, &data_fake, stopparen, recursed_depth, NULL, f, depth+1);
4824                 if (scan->flags) {
4825                     if (deltanext) {
4826                         FAIL("Variable length lookbehind not implemented");
4827                     }
4828                     else if (minnext > (I32)U8_MAX) {
4829                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4830                     }
4831                     scan->flags = (U8)minnext;
4832                 }
4833                 if (data) {
4834                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4835                         pars++;
4836                     if (data_fake.flags & SF_HAS_EVAL)
4837                         data->flags |= SF_HAS_EVAL;
4838                     data->whilem_c = data_fake.whilem_c;
4839                 }
4840                 if (f & SCF_DO_STCLASS_AND) {
4841                     if (flags & SCF_DO_STCLASS_OR) {
4842                         /* OR before, AND after: ideally we would recurse with
4843                          * data_fake to get the AND applied by study of the
4844                          * remainder of the pattern, and then derecurse;
4845                          * *** HACK *** for now just treat as "no information".
4846                          * See [perl #56690].
4847                          */
4848                         ssc_init(pRExC_state, data->start_class);
4849                     }  else {
4850                         /* AND before and after: combine and continue */
4851                         ssc_and(pRExC_state, data->start_class, &intrnl);
4852                     }
4853                 }
4854             }
4855 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4856             else {
4857                 /* Positive Lookahead/lookbehind
4858                    In this case we can do fixed string optimisation,
4859                    but we must be careful about it. Note in the case of
4860                    lookbehind the positions will be offset by the minimum
4861                    length of the pattern, something we won't know about
4862                    until after the recurse.
4863                 */
4864                 SSize_t deltanext, fake = 0;
4865                 regnode *nscan;
4866                 regnode_ssc intrnl;
4867                 int f = 0;
4868                 /* We use SAVEFREEPV so that when the full compile
4869                     is finished perl will clean up the allocated
4870                     minlens when it's all done. This way we don't
4871                     have to worry about freeing them when we know
4872                     they wont be used, which would be a pain.
4873                  */
4874                 SSize_t *minnextp;
4875                 Newx( minnextp, 1, SSize_t );
4876                 SAVEFREEPV(minnextp);
4877
4878                 if (data) {
4879                     StructCopy(data, &data_fake, scan_data_t);
4880                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4881                         f |= SCF_DO_SUBSTR;
4882                         if (scan->flags)
4883                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4884                         data_fake.last_found=newSVsv(data->last_found);
4885                     }
4886                 }
4887                 else
4888                     data_fake.last_closep = &fake;
4889                 data_fake.flags = 0;
4890                 data_fake.pos_delta = delta;
4891                 if (is_inf)
4892                     data_fake.flags |= SF_IS_INF;
4893                 if ( flags & SCF_DO_STCLASS && !scan->flags
4894                      && OP(scan) == IFMATCH ) { /* Lookahead */
4895                     ssc_init(pRExC_state, &intrnl);
4896                     data_fake.start_class = &intrnl;
4897                     f |= SCF_DO_STCLASS_AND;
4898                 }
4899                 if (flags & SCF_WHILEM_VISITED_POS)
4900                     f |= SCF_WHILEM_VISITED_POS;
4901                 next = regnext(scan);
4902                 nscan = NEXTOPER(NEXTOPER(scan));
4903
4904                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4905                     last, &data_fake, stopparen, recursed_depth, NULL, f,depth+1);
4906                 if (scan->flags) {
4907                     if (deltanext) {
4908                         FAIL("Variable length lookbehind not implemented");
4909                     }
4910                     else if (*minnextp > (I32)U8_MAX) {
4911                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4912                     }
4913                     scan->flags = (U8)*minnextp;
4914                 }
4915
4916                 *minnextp += min;
4917
4918                 if (f & SCF_DO_STCLASS_AND) {
4919                     ssc_and(pRExC_state, data->start_class, &intrnl);
4920                 }
4921                 if (data) {
4922                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4923                         pars++;
4924                     if (data_fake.flags & SF_HAS_EVAL)
4925                         data->flags |= SF_HAS_EVAL;
4926                     data->whilem_c = data_fake.whilem_c;
4927                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4928                         if (RExC_rx->minlen<*minnextp)
4929                             RExC_rx->minlen=*minnextp;
4930                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4931                         SvREFCNT_dec_NN(data_fake.last_found);
4932
4933                         if ( data_fake.minlen_fixed != minlenp )
4934                         {
4935                             data->offset_fixed= data_fake.offset_fixed;
4936                             data->minlen_fixed= data_fake.minlen_fixed;
4937                             data->lookbehind_fixed+= scan->flags;
4938                         }
4939                         if ( data_fake.minlen_float != minlenp )
4940                         {
4941                             data->minlen_float= data_fake.minlen_float;
4942                             data->offset_float_min=data_fake.offset_float_min;
4943                             data->offset_float_max=data_fake.offset_float_max;
4944                             data->lookbehind_float+= scan->flags;
4945                         }
4946                     }
4947                 }
4948             }
4949 #endif
4950         }
4951         else if (OP(scan) == OPEN) {
4952             if (stopparen != (I32)ARG(scan))
4953                 pars++;
4954         }
4955         else if (OP(scan) == CLOSE) {
4956             if (stopparen == (I32)ARG(scan)) {
4957                 break;
4958             }
4959             if ((I32)ARG(scan) == is_par) {
4960                 next = regnext(scan);
4961
4962                 if ( next && (OP(next) != WHILEM) && next < last)
4963                     is_par = 0;         /* Disable optimization */
4964             }
4965             if (data)
4966                 *(data->last_closep) = ARG(scan);
4967         }
4968         else if (OP(scan) == EVAL) {
4969                 if (data)
4970                     data->flags |= SF_HAS_EVAL;
4971         }
4972         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4973             if (flags & SCF_DO_SUBSTR) {
4974                 SCAN_COMMIT(pRExC_state,data,minlenp);
4975                 flags &= ~SCF_DO_SUBSTR;
4976             }
4977             if (data && OP(scan)==ACCEPT) {
4978                 data->flags |= SCF_SEEN_ACCEPT;
4979                 if (stopmin > min)
4980                     stopmin = min;
4981             }
4982         }
4983         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4984         {
4985                 if (flags & SCF_DO_SUBSTR) {
4986                     SCAN_COMMIT(pRExC_state,data,minlenp);
4987                     data->longest = &(data->longest_float);
4988                 }
4989                 is_inf = is_inf_internal = 1;
4990                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4991                     ssc_anything(data->start_class);
4992                 flags &= ~SCF_DO_STCLASS;
4993         }
4994         else if (OP(scan) == GPOS) {
4995             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4996                 !(delta || is_inf || (data && data->pos_delta)))
4997             {
4998                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4999                     RExC_rx->extflags |= RXf_ANCH_GPOS;
5000                 if (RExC_rx->gofs < (STRLEN)min)
5001                     RExC_rx->gofs = min;
5002             } else {
5003                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
5004                 RExC_rx->gofs = 0;
5005             }
5006         }
5007 #ifdef TRIE_STUDY_OPT
5008 #ifdef FULL_TRIE_STUDY
5009         else if (PL_regkind[OP(scan)] == TRIE) {
5010             /* NOTE - There is similar code to this block above for handling
5011                BRANCH nodes on the initial study.  If you change stuff here
5012                check there too. */
5013             regnode *trie_node= scan;
5014             regnode *tail= regnext(scan);
5015             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5016             SSize_t max1 = 0, min1 = SSize_t_MAX;
5017             regnode_ssc accum;
5018
5019             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
5020                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
5021             if (flags & SCF_DO_STCLASS)
5022                 ssc_init_zero(pRExC_state, &accum);
5023
5024             if (!trie->jump) {
5025                 min1= trie->minlen;
5026                 max1= trie->maxlen;
5027             } else {
5028                 const regnode *nextbranch= NULL;
5029                 U32 word;
5030
5031                 for ( word=1 ; word <= trie->wordcount ; word++)
5032                 {
5033                     SSize_t deltanext=0, minnext=0, f = 0, fake;
5034                     regnode_ssc this_class;
5035
5036                     data_fake.flags = 0;
5037                     if (data) {
5038                         data_fake.whilem_c = data->whilem_c;
5039                         data_fake.last_closep = data->last_closep;
5040                     }
5041                     else
5042                         data_fake.last_closep = &fake;
5043                     data_fake.pos_delta = delta;
5044                     if (flags & SCF_DO_STCLASS) {
5045                         ssc_init(pRExC_state, &this_class);
5046                         data_fake.start_class = &this_class;
5047                         f = SCF_DO_STCLASS_AND;
5048                     }
5049                     if (flags & SCF_WHILEM_VISITED_POS)
5050                         f |= SCF_WHILEM_VISITED_POS;
5051
5052                     if (trie->jump[word]) {
5053                         if (!nextbranch)
5054                             nextbranch = trie_node + trie->jump[0];
5055                         scan= trie_node + trie->jump[word];
5056                         /* We go from the jump point to the branch that follows
5057                            it. Note this means we need the vestigal unused branches
5058                            even though they arent otherwise used.
5059                          */
5060                         minnext = study_chunk(pRExC_state, &scan, minlenp,
5061                             &deltanext, (regnode *)nextbranch, &data_fake,
5062                             stopparen, recursed_depth, NULL, f,depth+1);
5063                     }
5064                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5065                         nextbranch= regnext((regnode*)nextbranch);
5066
5067                     if (min1 > (SSize_t)(minnext + trie->minlen))
5068                         min1 = minnext + trie->minlen;
5069                     if (deltanext == SSize_t_MAX) {
5070                         is_inf = is_inf_internal = 1;
5071                         max1 = SSize_t_MAX;
5072                     } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5073                         max1 = minnext + deltanext + trie->maxlen;
5074
5075                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5076                         pars++;
5077                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
5078                         if ( stopmin > min + min1)
5079                             stopmin = min + min1;
5080                         flags &= ~SCF_DO_SUBSTR;
5081                         if (data)
5082                             data->flags |= SCF_SEEN_ACCEPT;
5083                     }
5084                     if (data) {
5085                         if (data_fake.flags & SF_HAS_EVAL)
5086                             data->flags |= SF_HAS_EVAL;
5087                         data->whilem_c = data_fake.whilem_c;
5088                     }
5089                     if (flags & SCF_DO_STCLASS)
5090                         ssc_or(pRExC_state, &accum, &this_class);
5091                 }
5092             }
5093             if (flags & SCF_DO_SUBSTR) {
5094                 data->pos_min += min1;
5095                 data->pos_delta += max1 - min1;
5096                 if (max1 != min1 || is_inf)
5097                     data->longest = &(data->longest_float);
5098             }
5099             min += min1;
5100             delta += max1 - min1;
5101             if (flags & SCF_DO_STCLASS_OR) {
5102                 ssc_or(pRExC_state, data->start_class, &accum);
5103                 if (min1) {
5104                     ssc_and(pRExC_state, data->start_class, and_withp);
5105                     flags &= ~SCF_DO_STCLASS;
5106                 }
5107             }
5108             else if (flags & SCF_DO_STCLASS_AND) {
5109                 if (min1) {
5110                     ssc_and(pRExC_state, data->start_class, &accum);
5111                     flags &= ~SCF_DO_STCLASS;
5112                 }
5113                 else {
5114                     /* Switch to OR mode: cache the old value of
5115                      * data->start_class */
5116                     INIT_AND_WITHP;
5117                     StructCopy(data->start_class, and_withp, regnode_ssc);
5118                     flags &= ~SCF_DO_STCLASS_AND;
5119                     StructCopy(&accum, data->start_class, regnode_ssc);
5120                     flags |= SCF_DO_STCLASS_OR;
5121                 }
5122             }
5123             scan= tail;
5124             continue;
5125         }
5126 #else
5127         else if (PL_regkind[OP(scan)] == TRIE) {
5128             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5129             U8*bang=NULL;
5130
5131             min += trie->minlen;
5132             delta += (trie->maxlen - trie->minlen);
5133             flags &= ~SCF_DO_STCLASS; /* xxx */
5134             if (flags & SCF_DO_SUBSTR) {
5135                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
5136                 data->pos_min += trie->minlen;
5137                 data->pos_delta += (trie->maxlen - trie->minlen);
5138                 if (trie->maxlen != trie->minlen)
5139                     data->longest = &(data->longest_float);
5140             }
5141             if (trie->jump) /* no more substrings -- for now /grr*/
5142                 flags &= ~SCF_DO_SUBSTR;
5143         }
5144 #endif /* old or new */
5145 #endif /* TRIE_STUDY_OPT */
5146
5147         /* Else: zero-length, ignore. */
5148         scan = regnext(scan);
5149     }
5150     /* If we are exiting a recursion we can unset its recursed bit
5151      * and allow ourselves to enter it again - no danger of an
5152      * infinite loop there.
5153     if (stopparen > -1 && recursed) {
5154         DEBUG_STUDYDATA("unset:", data,depth);
5155         PAREN_UNSET( recursed, stopparen);
5156     }
5157     */
5158     if (frame) {
5159         DEBUG_STUDYDATA("frame-end:",data,depth);
5160         DEBUG_PEEP("fend", scan, depth);
5161         /* restore previous context */
5162         last = frame->last;
5163         scan = frame->next;
5164         stopparen = frame->stop;
5165         recursed_depth = frame->prev_recursed_depth;
5166         depth = depth - 1;
5167
5168         frame = frame->prev;
5169         goto fake_study_recurse;
5170     }
5171
5172   finish:
5173     assert(!frame);
5174     DEBUG_STUDYDATA("pre-fin:",data,depth);
5175
5176     *scanp = scan;
5177     *deltap = is_inf_internal ? SSize_t_MAX : delta;
5178     if (flags & SCF_DO_SUBSTR && is_inf)
5179         data->pos_delta = SSize_t_MAX - data->pos_min;
5180     if (is_par > (I32)U8_MAX)
5181         is_par = 0;
5182     if (is_par && pars==1 && data) {
5183         data->flags |= SF_IN_PAR;
5184         data->flags &= ~SF_HAS_PAR;
5185     }
5186     else if (pars && data) {
5187         data->flags |= SF_HAS_PAR;
5188         data->flags &= ~SF_IN_PAR;
5189     }
5190     if (flags & SCF_DO_STCLASS_OR)
5191         ssc_and(pRExC_state, data->start_class, and_withp);
5192     if (flags & SCF_TRIE_RESTUDY)
5193         data->flags |=  SCF_TRIE_RESTUDY;
5194
5195     DEBUG_STUDYDATA("post-fin:",data,depth);
5196
5197     return min < stopmin ? min : stopmin;
5198 }
5199
5200 STATIC U32
5201 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5202 {
5203     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5204
5205     PERL_ARGS_ASSERT_ADD_DATA;
5206
5207     Renewc(RExC_rxi->data,
5208            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5209            char, struct reg_data);
5210     if(count)
5211         Renew(RExC_rxi->data->what, count + n, U8);
5212     else
5213         Newx(RExC_rxi->data->what, n, U8);
5214     RExC_rxi->data->count = count + n;
5215     Copy(s, RExC_rxi->data->what + count, n, U8);
5216     return count;
5217 }
5218
5219 /*XXX: todo make this not included in a non debugging perl */
5220 #ifndef PERL_IN_XSUB_RE
5221 void
5222 Perl_reginitcolors(pTHX)
5223 {
5224     dVAR;
5225     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5226     if (s) {
5227         char *t = savepv(s);
5228         int i = 0;
5229         PL_colors[0] = t;
5230         while (++i < 6) {
5231             t = strchr(t, '\t');
5232             if (t) {
5233                 *t = '\0';
5234                 PL_colors[i] = ++t;
5235             }
5236             else
5237                 PL_colors[i] = t = (char *)"";
5238         }
5239     } else {
5240         int i = 0;
5241         while (i < 6)
5242             PL_colors[i++] = (char *)"";
5243     }
5244     PL_colorset = 1;
5245 }
5246 #endif
5247
5248
5249 #ifdef TRIE_STUDY_OPT
5250 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
5251     STMT_START {                                            \
5252         if (                                                \
5253               (data.flags & SCF_TRIE_RESTUDY)               \
5254               && ! restudied++                              \
5255         ) {                                                 \
5256             dOsomething;                                    \
5257             goto reStudy;                                   \
5258         }                                                   \
5259     } STMT_END
5260 #else
5261 #define CHECK_RESTUDY_GOTO_butfirst
5262 #endif
5263
5264 /*
5265  * pregcomp - compile a regular expression into internal code
5266  *
5267  * Decides which engine's compiler to call based on the hint currently in
5268  * scope
5269  */
5270
5271 #ifndef PERL_IN_XSUB_RE
5272
5273 /* return the currently in-scope regex engine (or the default if none)  */
5274
5275 regexp_engine const *
5276 Perl_current_re_engine(pTHX)
5277 {
5278     dVAR;
5279
5280     if (IN_PERL_COMPILETIME) {
5281         HV * const table = GvHV(PL_hintgv);
5282         SV **ptr;
5283
5284         if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5285             return &PL_core_reg_engine;
5286         ptr = hv_fetchs(table, "regcomp", FALSE);
5287         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5288             return &PL_core_reg_engine;
5289         return INT2PTR(regexp_engine*,SvIV(*ptr));
5290     }
5291     else {
5292         SV *ptr;
5293         if (!PL_curcop->cop_hints_hash)
5294             return &PL_core_reg_engine;
5295         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5296         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5297             return &PL_core_reg_engine;
5298         return INT2PTR(regexp_engine*,SvIV(ptr));
5299     }
5300 }
5301
5302
5303 REGEXP *
5304 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5305 {
5306     dVAR;
5307     regexp_engine const *eng = current_re_engine();
5308     GET_RE_DEBUG_FLAGS_DECL;
5309
5310     PERL_ARGS_ASSERT_PREGCOMP;
5311
5312     /* Dispatch a request to compile a regexp to correct regexp engine. */
5313     DEBUG_COMPILE_r({
5314         PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5315                         PTR2UV(eng));
5316     });
5317     return CALLREGCOMP_ENG(eng, pattern, flags);
5318 }
5319 #endif
5320
5321 /* public(ish) entry point for the perl core's own regex compiling code.
5322  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5323  * pattern rather than a list of OPs, and uses the internal engine rather
5324  * than the current one */
5325
5326 REGEXP *
5327 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5328 {
5329     SV *pat = pattern; /* defeat constness! */
5330     PERL_ARGS_ASSERT_RE_COMPILE;
5331     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5332 #ifdef PERL_IN_XSUB_RE
5333                                 &my_reg_engine,
5334 #else
5335                                 &PL_core_reg_engine,
5336 #endif
5337                                 NULL, NULL, rx_flags, 0);
5338 }
5339
5340
5341 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5342  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5343  * point to the realloced string and length.
5344  *
5345  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5346  * stuff added */
5347
5348 static void
5349 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5350                     char **pat_p, STRLEN *plen_p, int num_code_blocks)
5351 {
5352     U8 *const src = (U8*)*pat_p;
5353     U8 *dst;
5354     int n=0;
5355     STRLEN s = 0, d = 0;
5356     bool do_end = 0;
5357     GET_RE_DEBUG_FLAGS_DECL;
5358
5359     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5360         "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5361
5362     Newx(dst, *plen_p * 2 + 1, U8);
5363
5364     while (s < *plen_p) {
5365         if (NATIVE_BYTE_IS_INVARIANT(src[s]))
5366             dst[d]   = src[s];
5367         else {
5368             dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
5369             dst[d]   = UTF8_EIGHT_BIT_LO(src[s]);
5370         }
5371         if (n < num_code_blocks) {
5372             if (!do_end && pRExC_state->code_blocks[n].start == s) {
5373                 pRExC_state->code_blocks[n].start = d;
5374                 assert(dst[d] == '(');
5375                 do_end = 1;
5376             }
5377             else if (do_end && pRExC_state->code_blocks[n].end == s) {
5378                 pRExC_state->code_blocks[n].end = d;
5379                 assert(dst[d] == ')');
5380                 do_end = 0;
5381                 n++;
5382             }
5383         }
5384         s++;
5385         d++;
5386     }
5387     dst[d] = '\0';
5388     *plen_p = d;
5389     *pat_p = (char*) dst;
5390     SAVEFREEPV(*pat_p);
5391     RExC_orig_utf8 = RExC_utf8 = 1;
5392 }
5393
5394
5395
5396 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5397  * while recording any code block indices, and handling overloading,
5398  * nested qr// objects etc.  If pat is null, it will allocate a new
5399  * string, or just return the first arg, if there's only one.
5400  *
5401  * Returns the malloced/updated pat.
5402  * patternp and pat_count is the array of SVs to be concatted;
5403  * oplist is the optional list of ops that generated the SVs;
5404  * recompile_p is a pointer to a boolean that will be set if
5405  *   the regex will need to be recompiled.
5406  * delim, if non-null is an SV that will be inserted between each element
5407  */
5408
5409 static SV*
5410 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5411                 SV *pat, SV ** const patternp, int pat_count,
5412                 OP *oplist, bool *recompile_p, SV *delim)
5413 {
5414     SV **svp;
5415     int n = 0;
5416     bool use_delim = FALSE;
5417     bool alloced = FALSE;
5418
5419     /* if we know we have at least two args, create an empty string,
5420      * then concatenate args to that. For no args, return an empty string */
5421     if (!pat && pat_count != 1) {
5422         pat = newSVpvn("", 0);
5423         SAVEFREESV(pat);
5424         alloced = TRUE;
5425     }
5426
5427     for (svp = patternp; svp < patternp + pat_count; svp++) {
5428         SV *sv;
5429         SV *rx  = NULL;
5430         STRLEN orig_patlen = 0;
5431         bool code = 0;
5432         SV *msv = use_delim ? delim : *svp;
5433         if (!msv) msv = &PL_sv_undef;
5434
5435         /* if we've got a delimiter, we go round the loop twice for each
5436          * svp slot (except the last), using the delimiter the second
5437          * time round */
5438         if (use_delim) {
5439             svp--;
5440             use_delim = FALSE;
5441         }
5442         else if (delim)
5443             use_delim = TRUE;
5444
5445         if (SvTYPE(msv) == SVt_PVAV) {
5446             /* we've encountered an interpolated array within
5447              * the pattern, e.g. /...@a..../. Expand the list of elements,
5448              * then recursively append elements.
5449              * The code in this block is based on S_pushav() */
5450
5451             AV *const av = (AV*)msv;
5452             const SSize_t maxarg = AvFILL(av) + 1;
5453             SV **array;
5454
5455             if (oplist) {
5456                 assert(oplist->op_type == OP_PADAV
5457                     || oplist->op_type == OP_RV2AV);
5458                 oplist = oplist->op_sibling;;
5459             }
5460
5461             if (SvRMAGICAL(av)) {
5462                 SSize_t i;
5463
5464                 Newx(array, maxarg, SV*);
5465                 SAVEFREEPV(array);
5466                 for (i=0; i < maxarg; i++) {
5467                     SV ** const svp = av_fetch(av, i, FALSE);
5468                     array[i] = svp ? *svp : &PL_sv_undef;
5469                 }
5470             }
5471             else
5472                 array = AvARRAY(av);
5473
5474             pat = S_concat_pat(aTHX_ pRExC_state, pat,
5475                                 array, maxarg, NULL, recompile_p,
5476                                 /* $" */
5477                                 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5478
5479             continue;
5480         }
5481
5482
5483         /* we make the assumption here that each op in the list of
5484          * op_siblings maps to one SV pushed onto the stack,
5485          * except for code blocks, with have both an OP_NULL and
5486          * and OP_CONST.
5487          * This allows us to match up the list of SVs against the
5488          * list of OPs to find the next code block.
5489          *
5490          * Note that       PUSHMARK PADSV PADSV ..
5491          * is optimised to
5492          *                 PADRANGE PADSV  PADSV  ..
5493          * so the alignment still works. */
5494
5495         if (oplist) {
5496             if (oplist->op_type == OP_NULL
5497                 && (oplist->op_flags & OPf_SPECIAL))
5498             {
5499                 assert(n < pRExC_state->num_code_blocks);
5500                 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5501                 pRExC_state->code_blocks[n].block = oplist;
5502                 pRExC_state->code_blocks[n].src_regex = NULL;
5503                 n++;
5504                 code = 1;
5505                 oplist = oplist->op_sibling; /* skip CONST */
5506                 assert(oplist);
5507             }
5508             oplist = oplist->op_sibling;;
5509         }
5510
5511         /* apply magic and QR overloading to arg */
5512
5513         SvGETMAGIC(msv);
5514         if (SvROK(msv) && SvAMAGIC(msv)) {
5515             SV *sv = AMG_CALLunary(msv, regexp_amg);
5516             if (sv) {
5517                 if (SvROK(sv))
5518                     sv = SvRV(sv);
5519                 if (SvTYPE(sv) != SVt_REGEXP)
5520                     Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5521                 msv = sv;
5522             }
5523         }
5524
5525         /* try concatenation overload ... */
5526         if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5527                 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5528         {
5529             sv_setsv(pat, sv);
5530             /* overloading involved: all bets are off over literal
5531              * code. Pretend we haven't seen it */
5532             pRExC_state->num_code_blocks -= n;
5533             n = 0;
5534         }
5535         else  {
5536             /* ... or failing that, try "" overload */
5537             while (SvAMAGIC(msv)
5538                     && (sv = AMG_CALLunary(msv, string_amg))
5539                     && sv != msv
5540                     &&  !(   SvROK(msv)
5541                           && SvROK(sv)
5542                           && SvRV(msv) == SvRV(sv))
5543             ) {
5544                 msv = sv;
5545                 SvGETMAGIC(msv);
5546             }
5547             if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5548                 msv = SvRV(msv);
5549
5550             if (pat) {
5551                 /* this is a partially unrolled
5552                  *     sv_catsv_nomg(pat, msv);
5553                  * that allows us to adjust code block indices if
5554                  * needed */
5555                 STRLEN dlen;
5556                 char *dst = SvPV_force_nomg(pat, dlen);
5557                 orig_patlen = dlen;
5558                 if (SvUTF8(msv) && !SvUTF8(pat)) {
5559                     S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5560                     sv_setpvn(pat, dst, dlen);
5561                     SvUTF8_on(pat);
5562                 }
5563                 sv_catsv_nomg(pat, msv);
5564                 rx = msv;
5565             }
5566             else
5567                 pat = msv;
5568
5569             if (code)
5570                 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5571         }
5572
5573         /* extract any code blocks within any embedded qr//'s */
5574         if (rx && SvTYPE(rx) == SVt_REGEXP
5575             && RX_ENGINE((REGEXP*)rx)->op_comp)
5576         {
5577
5578             RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5579             if (ri->num_code_blocks) {
5580                 int i;
5581                 /* the presence of an embedded qr// with code means
5582                  * we should always recompile: the text of the
5583                  * qr// may not have changed, but it may be a
5584                  * different closure than last time */
5585                 *recompile_p = 1;
5586                 Renew(pRExC_state->code_blocks,
5587                     pRExC_state->num_code_blocks + ri->num_code_blocks,
5588                     struct reg_code_block);
5589                 pRExC_state->num_code_blocks += ri->num_code_blocks;
5590
5591                 for (i=0; i < ri->num_code_blocks; i++) {
5592                     struct reg_code_block *src, *dst;
5593                     STRLEN offset =  orig_patlen
5594                         + ReANY((REGEXP *)rx)->pre_prefix;
5595                     assert(n < pRExC_state->num_code_blocks);
5596                     src = &ri->code_blocks[i];
5597                     dst = &pRExC_state->code_blocks[n];
5598                     dst->start      = src->start + offset;
5599                     dst->end        = src->end   + offset;
5600                     dst->block      = src->block;
5601                     dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
5602                                             src->src_regex
5603                                                 ? src->src_regex
5604                                                 : (REGEXP*)rx);
5605                     n++;
5606                 }
5607             }
5608         }
5609     }
5610     /* avoid calling magic multiple times on a single element e.g. =~ $qr */
5611     if (alloced)
5612         SvSETMAGIC(pat);
5613
5614     return pat;
5615 }
5616
5617
5618
5619 /* see if there are any run-time code blocks in the pattern.
5620  * False positives are allowed */
5621
5622 static bool
5623 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5624                     char *pat, STRLEN plen)
5625 {
5626     int n = 0;
5627     STRLEN s;
5628
5629     for (s = 0; s < plen; s++) {
5630         if (n < pRExC_state->num_code_blocks
5631             && s == pRExC_state->code_blocks[n].start)
5632         {
5633             s = pRExC_state->code_blocks[n].end;
5634             n++;
5635             continue;
5636         }
5637         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5638          * positives here */
5639         if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
5640             (pat[s+2] == '{'
5641                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
5642         )
5643             return 1;
5644     }
5645     return 0;
5646 }
5647
5648 /* Handle run-time code blocks. We will already have compiled any direct
5649  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5650  * copy of it, but with any literal code blocks blanked out and
5651  * appropriate chars escaped; then feed it into
5652  *
5653  *    eval "qr'modified_pattern'"
5654  *
5655  * For example,
5656  *
5657  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5658  *
5659  * becomes
5660  *
5661  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5662  *
5663  * After eval_sv()-ing that, grab any new code blocks from the returned qr
5664  * and merge them with any code blocks of the original regexp.
5665  *
5666  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5667  * instead, just save the qr and return FALSE; this tells our caller that
5668  * the original pattern needs upgrading to utf8.
5669  */
5670
5671 static bool
5672 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5673     char *pat, STRLEN plen)
5674 {
5675     SV *qr;
5676
5677     GET_RE_DEBUG_FLAGS_DECL;
5678
5679     if (pRExC_state->runtime_code_qr) {
5680         /* this is the second time we've been called; this should
5681          * only happen if the main pattern got upgraded to utf8
5682          * during compilation; re-use the qr we compiled first time
5683          * round (which should be utf8 too)
5684          */
5685         qr = pRExC_state->runtime_code_qr;
5686         pRExC_state->runtime_code_qr = NULL;
5687         assert(RExC_utf8 && SvUTF8(qr));
5688     }
5689     else {
5690         int n = 0;
5691         STRLEN s;
5692         char *p, *newpat;
5693         int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5694         SV *sv, *qr_ref;
5695         dSP;
5696
5697         /* determine how many extra chars we need for ' and \ escaping */
5698         for (s = 0; s < plen; s++) {
5699             if (pat[s] == '\'' || pat[s] == '\\')
5700                 newlen++;
5701         }
5702
5703         Newx(newpat, newlen, char);
5704         p = newpat;
5705         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5706
5707         for (s = 0; s < plen; s++) {
5708             if (n < pRExC_state->num_code_blocks
5709                 && s == pRExC_state->code_blocks[n].start)
5710             {
5711                 /* blank out literal code block */
5712                 assert(pat[s] == '(');
5713                 while (s <= pRExC_state->code_blocks[n].end) {
5714                     *p++ = '_';
5715                     s++;
5716                 }
5717                 s--;
5718                 n++;
5719                 continue;
5720             }
5721             if (pat[s] == '\'' || pat[s] == '\\')
5722                 *p++ = '\\';
5723             *p++ = pat[s];
5724         }
5725         *p++ = '\'';
5726         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5727             *p++ = 'x';
5728         *p++ = '\0';
5729         DEBUG_COMPILE_r({
5730             PerlIO_printf(Perl_debug_log,
5731                 "%sre-parsing pattern for runtime code:%s %s\n",
5732                 PL_colors[4],PL_colors[5],newpat);
5733         });
5734
5735         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5736         Safefree(newpat);
5737
5738         ENTER;
5739         SAVETMPS;
5740         save_re_context();
5741         PUSHSTACKi(PERLSI_REQUIRE);
5742         /* G_RE_REPARSING causes the toker to collapse \\ into \ when
5743          * parsing qr''; normally only q'' does this. It also alters
5744          * hints handling */
5745         eval_sv(sv, G_SCALAR|G_RE_REPARSING);
5746         SvREFCNT_dec_NN(sv);
5747         SPAGAIN;
5748         qr_ref = POPs;
5749         PUTBACK;
5750         {
5751             SV * const errsv = ERRSV;
5752             if (SvTRUE_NN(errsv))
5753             {
5754                 Safefree(pRExC_state->code_blocks);
5755                 /* use croak_sv ? */
5756                 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
5757             }
5758         }
5759         assert(SvROK(qr_ref));
5760         qr = SvRV(qr_ref);
5761         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5762         /* the leaving below frees the tmp qr_ref.
5763          * Give qr a life of its own */
5764         SvREFCNT_inc(qr);
5765         POPSTACK;
5766         FREETMPS;
5767         LEAVE;
5768
5769     }
5770
5771     if (!RExC_utf8 && SvUTF8(qr)) {
5772         /* first time through; the pattern got upgraded; save the
5773          * qr for the next time through */
5774         assert(!pRExC_state->runtime_code_qr);
5775         pRExC_state->runtime_code_qr = qr;
5776         return 0;
5777     }
5778
5779
5780     /* extract any code blocks within the returned qr//  */
5781
5782
5783     /* merge the main (r1) and run-time (r2) code blocks into one */
5784     {
5785         RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
5786         struct reg_code_block *new_block, *dst;
5787         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5788         int i1 = 0, i2 = 0;
5789
5790         if (!r2->num_code_blocks) /* we guessed wrong */
5791         {
5792             SvREFCNT_dec_NN(qr);
5793             return 1;
5794         }
5795
5796         Newx(new_block,
5797             r1->num_code_blocks + r2->num_code_blocks,
5798             struct reg_code_block);
5799         dst = new_block;
5800
5801         while (    i1 < r1->num_code_blocks
5802                 || i2 < r2->num_code_blocks)
5803         {
5804             struct reg_code_block *src;
5805             bool is_qr = 0;
5806
5807             if (i1 == r1->num_code_blocks) {
5808                 src = &r2->code_blocks[i2++];
5809                 is_qr = 1;
5810             }
5811             else if (i2 == r2->num_code_blocks)
5812                 src = &r1->code_blocks[i1++];
5813             else if (  r1->code_blocks[i1].start
5814                      < r2->code_blocks[i2].start)
5815             {
5816                 src = &r1->code_blocks[i1++];
5817                 assert(src->end < r2->code_blocks[i2].start);
5818             }
5819             else {
5820                 assert(  r1->code_blocks[i1].start
5821                        > r2->code_blocks[i2].start);
5822                 src = &r2->code_blocks[i2++];
5823                 is_qr = 1;
5824                 assert(src->end < r1->code_blocks[i1].start);
5825             }
5826
5827             assert(pat[src->start] == '(');
5828             assert(pat[src->end]   == ')');
5829             dst->start      = src->start;
5830             dst->end        = src->end;
5831             dst->block      = src->block;
5832             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5833                                     : src->src_regex;
5834             dst++;
5835         }
5836         r1->num_code_blocks += r2->num_code_blocks;
5837         Safefree(r1->code_blocks);
5838         r1->code_blocks = new_block;
5839     }
5840
5841     SvREFCNT_dec_NN(qr);
5842     return 1;
5843 }
5844
5845
5846 STATIC bool
5847 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
5848                       SSize_t lookbehind, SSize_t offset, SSize_t *minlen, STRLEN longest_length, bool eol, bool meol)
5849 {
5850     /* This is the common code for setting up the floating and fixed length
5851      * string data extracted from Perl_re_op_compile() below.  Returns a boolean
5852      * as to whether succeeded or not */
5853
5854     I32 t;
5855     SSize_t ml;
5856
5857     if (! (longest_length
5858            || (eol /* Can't have SEOL and MULTI */
5859                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5860           )
5861             /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5862         || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5863     {
5864         return FALSE;
5865     }
5866
5867     /* copy the information about the longest from the reg_scan_data
5868         over to the program. */
5869     if (SvUTF8(sv_longest)) {
5870         *rx_utf8 = sv_longest;
5871         *rx_substr = NULL;
5872     } else {
5873         *rx_substr = sv_longest;
5874         *rx_utf8 = NULL;
5875     }
5876     /* end_shift is how many chars that must be matched that
5877         follow this item. We calculate it ahead of time as once the
5878         lookbehind offset is added in we lose the ability to correctly
5879         calculate it.*/
5880     ml = minlen ? *(minlen) : (SSize_t)longest_length;
5881     *rx_end_shift = ml - offset
5882         - longest_length + (SvTAIL(sv_longest) != 0)
5883         + lookbehind;
5884
5885     t = (eol/* Can't have SEOL and MULTI */
5886          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5887     fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5888
5889     return TRUE;
5890 }
5891
5892 /*
5893  * Perl_re_op_compile - the perl internal RE engine's function to compile a
5894  * regular expression into internal code.
5895  * The pattern may be passed either as:
5896  *    a list of SVs (patternp plus pat_count)
5897  *    a list of OPs (expr)
5898  * If both are passed, the SV list is used, but the OP list indicates
5899  * which SVs are actually pre-compiled code blocks
5900  *
5901  * The SVs in the list have magic and qr overloading applied to them (and
5902  * the list may be modified in-place with replacement SVs in the latter
5903  * case).
5904  *
5905  * If the pattern hasn't changed from old_re, then old_re will be
5906  * returned.
5907  *
5908  * eng is the current engine. If that engine has an op_comp method, then
5909  * handle directly (i.e. we assume that op_comp was us); otherwise, just
5910  * do the initial concatenation of arguments and pass on to the external
5911  * engine.
5912  *
5913  * If is_bare_re is not null, set it to a boolean indicating whether the
5914  * arg list reduced (after overloading) to a single bare regex which has
5915  * been returned (i.e. /$qr/).
5916  *
5917  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5918  *
5919  * pm_flags contains the PMf_* flags, typically based on those from the
5920  * pm_flags field of the related PMOP. Currently we're only interested in
5921  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5922  *
5923  * We can't allocate space until we know how big the compiled form will be,
5924  * but we can't compile it (and thus know how big it is) until we've got a
5925  * place to put the code.  So we cheat:  we compile it twice, once with code
5926  * generation turned off and size counting turned on, and once "for real".
5927  * This also means that we don't allocate space until we are sure that the
5928  * thing really will compile successfully, and we never have to move the
5929  * code and thus invalidate pointers into it.  (Note that it has to be in
5930  * one piece because free() must be able to free it all.) [NB: not true in perl]
5931  *
5932  * Beware that the optimization-preparation code in here knows about some
5933  * of the structure of the compiled regexp.  [I'll say.]
5934  */
5935
5936 REGEXP *
5937 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5938                     OP *expr, const regexp_engine* eng, REGEXP *old_re,
5939                      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5940 {
5941     dVAR;
5942     REGEXP *rx;
5943     struct regexp *r;
5944     regexp_internal *ri;
5945     STRLEN plen;
5946     char *exp;
5947     regnode *scan;
5948     I32 flags;
5949     SSize_t minlen = 0;
5950     U32 rx_flags;
5951     SV *pat;
5952     SV *code_blocksv = NULL;
5953     SV** new_patternp = patternp;
5954
5955     /* these are all flags - maybe they should be turned
5956      * into a single int with different bit masks */
5957     I32 sawlookahead = 0;
5958     I32 sawplus = 0;
5959     I32 sawopen = 0;
5960     I32 sawminmod = 0;
5961
5962     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5963     bool recompile = 0;
5964     bool runtime_code = 0;
5965     scan_data_t data;
5966     RExC_state_t RExC_state;
5967     RExC_state_t * const pRExC_state = &RExC_state;
5968 #ifdef TRIE_STUDY_OPT
5969     int restudied = 0;
5970     RExC_state_t copyRExC_state;
5971 #endif
5972     GET_RE_DEBUG_FLAGS_DECL;
5973
5974     PERL_ARGS_ASSERT_RE_OP_COMPILE;
5975
5976     DEBUG_r(if (!PL_colorset) reginitcolors());
5977
5978 #ifndef PERL_IN_XSUB_RE
5979     /* Initialize these here instead of as-needed, as is quick and avoids
5980      * having to test them each time otherwise */
5981     if (! PL_AboveLatin1) {
5982         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5983         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5984         PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
5985
5986         PL_Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(ASCII_invlist);
5987         PL_L1Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(ASCII_invlist);
5988         PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(ASCII_invlist);
5989
5990         PL_L1Posix_ptrs[_CC_ALPHANUMERIC]
5991                                 = _new_invlist_C_array(L1PosixAlnum_invlist);
5992         PL_Posix_ptrs[_CC_ALPHANUMERIC]
5993                                 = _new_invlist_C_array(PosixAlnum_invlist);
5994
5995         PL_L1Posix_ptrs[_CC_ALPHA]
5996                                 = _new_invlist_C_array(L1PosixAlpha_invlist);
5997         PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PosixAlpha_invlist);
5998
5999         PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PosixBlank_invlist);
6000         PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(XPosixBlank_invlist);
6001
6002         /* Cased is the same as Alpha in the ASCII range */
6003         PL_L1Posix_ptrs[_CC_CASED] =  _new_invlist_C_array(L1Cased_invlist);
6004         PL_Posix_ptrs[_CC_CASED] =  _new_invlist_C_array(PosixAlpha_invlist);
6005
6006         PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PosixCntrl_invlist);
6007         PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(XPosixCntrl_invlist);
6008
6009         PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
6010         PL_L1Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
6011
6012         PL_L1Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(L1PosixGraph_invlist);
6013         PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PosixGraph_invlist);
6014
6015         PL_L1Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(L1PosixLower_invlist);
6016         PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PosixLower_invlist);
6017
6018         PL_L1Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(L1PosixPrint_invlist);
6019         PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PosixPrint_invlist);
6020
6021         PL_L1Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(L1PosixPunct_invlist);
6022         PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PosixPunct_invlist);
6023
6024         PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PerlSpace_invlist);
6025         PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(XPerlSpace_invlist);
6026         PL_Posix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(PosixSpace_invlist);
6027         PL_XPosix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(XPosixSpace_invlist);
6028
6029         PL_L1Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(L1PosixUpper_invlist);
6030         PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PosixUpper_invlist);
6031
6032         PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist);
6033
6034         PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PosixWord_invlist);
6035         PL_L1Posix_ptrs[_CC_WORDCHAR]
6036                                 = _new_invlist_C_array(L1PosixWord_invlist);
6037
6038         PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PosixXDigit_invlist);
6039         PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist);
6040
6041         PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
6042     }
6043 #endif
6044
6045     pRExC_state->code_blocks = NULL;
6046     pRExC_state->num_code_blocks = 0;
6047
6048     if (is_bare_re)
6049         *is_bare_re = FALSE;
6050
6051     if (expr && (expr->op_type == OP_LIST ||
6052                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6053         /* allocate code_blocks if needed */
6054         OP *o;
6055         int ncode = 0;
6056
6057         for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
6058             if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6059                 ncode++; /* count of DO blocks */
6060         if (ncode) {
6061             pRExC_state->num_code_blocks = ncode;
6062             Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6063         }
6064     }
6065
6066     if (!pat_count) {
6067         /* compile-time pattern with just OP_CONSTs and DO blocks */
6068
6069         int n;
6070         OP *o;
6071
6072         /* find how many CONSTs there are */
6073         assert(expr);
6074         n = 0;
6075         if (expr->op_type == OP_CONST)
6076             n = 1;
6077         else
6078             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6079                 if (o->op_type == OP_CONST)
6080                     n++;
6081             }
6082
6083         /* fake up an SV array */
6084
6085         assert(!new_patternp);
6086         Newx(new_patternp, n, SV*);
6087         SAVEFREEPV(new_patternp);
6088         pat_count = n;
6089
6090         n = 0;
6091         if (expr->op_type == OP_CONST)
6092             new_patternp[n] = cSVOPx_sv(expr);
6093         else
6094             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6095                 if (o->op_type == OP_CONST)
6096                     new_patternp[n++] = cSVOPo_sv;
6097             }
6098
6099     }
6100
6101     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6102         "Assembling pattern from %d elements%s\n", pat_count,
6103             orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6104
6105     /* set expr to the first arg op */
6106
6107     if (pRExC_state->num_code_blocks
6108          && expr->op_type != OP_CONST)
6109     {
6110             expr = cLISTOPx(expr)->op_first;
6111             assert(   expr->op_type == OP_PUSHMARK
6112                    || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6113                    || expr->op_type == OP_PADRANGE);
6114             expr = expr->op_sibling;
6115     }
6116
6117     pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6118                         expr, &recompile, NULL);
6119
6120     /* handle bare (possibly after overloading) regex: foo =~ $re */
6121     {
6122         SV *re = pat;
6123         if (SvROK(re))
6124             re = SvRV(re);
6125         if (SvTYPE(re) == SVt_REGEXP) {
6126             if (is_bare_re)
6127                 *is_bare_re = TRUE;
6128             SvREFCNT_inc(re);
6129             Safefree(pRExC_state->code_blocks);
6130             DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6131                 "Precompiled pattern%s\n",
6132                     orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6133
6134             return (REGEXP*)re;
6135         }
6136     }
6137
6138     exp = SvPV_nomg(pat, plen);
6139
6140     if (!eng->op_comp) {
6141         if ((SvUTF8(pat) && IN_BYTES)
6142                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6143         {
6144             /* make a temporary copy; either to convert to bytes,
6145              * or to avoid repeating get-magic / overloaded stringify */
6146             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6147                                         (IN_BYTES ? 0 : SvUTF8(pat)));
6148         }
6149         Safefree(pRExC_state->code_blocks);
6150         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6151     }
6152
6153     /* ignore the utf8ness if the pattern is 0 length */
6154     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6155     RExC_uni_semantics = 0;
6156     RExC_contains_locale = 0;
6157     RExC_contains_i = 0;
6158     pRExC_state->runtime_code_qr = NULL;
6159
6160     DEBUG_COMPILE_r({
6161             SV *dsv= sv_newmortal();
6162             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6163             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6164                           PL_colors[4],PL_colors[5],s);
6165         });
6166
6167   redo_first_pass:
6168     /* we jump here if we upgrade the pattern to utf8 and have to
6169      * recompile */
6170
6171     if ((pm_flags & PMf_USE_RE_EVAL)
6172                 /* this second condition covers the non-regex literal case,
6173                  * i.e.  $foo =~ '(?{})'. */
6174                 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6175     )
6176         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6177
6178     /* return old regex if pattern hasn't changed */
6179     /* XXX: note in the below we have to check the flags as well as the pattern.
6180      *
6181      * Things get a touch tricky as we have to compare the utf8 flag independently
6182      * from the compile flags.
6183      */
6184
6185     if (   old_re
6186         && !recompile
6187         && !!RX_UTF8(old_re) == !!RExC_utf8
6188         && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6189         && RX_PRECOMP(old_re)
6190         && RX_PRELEN(old_re) == plen
6191         && memEQ(RX_PRECOMP(old_re), exp, plen)
6192         && !runtime_code /* with runtime code, always recompile */ )
6193     {
6194         Safefree(pRExC_state->code_blocks);
6195         return old_re;
6196     }
6197
6198     rx_flags = orig_rx_flags;
6199
6200     if (rx_flags & PMf_FOLD) {
6201         RExC_contains_i = 1;
6202     }
6203     if (initial_charset == REGEX_LOCALE_CHARSET) {
6204         RExC_contains_locale = 1;
6205     }
6206     else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6207
6208         /* Set to use unicode semantics if the pattern is in utf8 and has the
6209          * 'depends' charset specified, as it means unicode when utf8  */
6210         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6211     }
6212
6213     RExC_precomp = exp;
6214     RExC_flags = rx_flags;
6215     RExC_pm_flags = pm_flags;
6216
6217     if (runtime_code) {
6218         if (TAINTING_get && TAINT_get)
6219             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6220
6221         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6222             /* whoops, we have a non-utf8 pattern, whilst run-time code
6223              * got compiled as utf8. Try again with a utf8 pattern */
6224             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6225                                     pRExC_state->num_code_blocks);
6226             goto redo_first_pass;
6227         }
6228     }
6229     assert(!pRExC_state->runtime_code_qr);
6230
6231     RExC_sawback = 0;
6232
6233     RExC_seen = 0;
6234     RExC_in_lookbehind = 0;
6235     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6236     RExC_extralen = 0;
6237     RExC_override_recoding = 0;
6238     RExC_in_multi_char_class = 0;
6239
6240     /* First pass: determine size, legality. */
6241     RExC_parse = exp;
6242     RExC_start = exp;
6243     RExC_end = exp + plen;
6244     RExC_naughty = 0;
6245     RExC_npar = 1;
6246     RExC_nestroot = 0;
6247     RExC_size = 0L;
6248     RExC_emit = (regnode *) &RExC_emit_dummy;
6249     RExC_whilem_seen = 0;
6250     RExC_open_parens = NULL;
6251     RExC_close_parens = NULL;
6252     RExC_opend = NULL;
6253     RExC_paren_names = NULL;
6254 #ifdef DEBUGGING
6255     RExC_paren_name_list = NULL;
6256 #endif
6257     RExC_recurse = NULL;
6258     RExC_study_chunk_recursed = NULL;
6259     RExC_study_chunk_recursed_bytes= 0;
6260     RExC_recurse_count = 0;
6261     pRExC_state->code_index = 0;
6262
6263 #if 0 /* REGC() is (currently) a NOP at the first pass.
6264        * Clever compilers notice this and complain. --jhi */
6265     REGC((U8)REG_MAGIC, (char*)RExC_emit);
6266 #endif
6267     DEBUG_PARSE_r(
6268         PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6269         RExC_lastnum=0;
6270         RExC_lastparse=NULL;
6271     );
6272     /* reg may croak on us, not giving us a chance to free
6273        pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
6274        need it to survive as long as the regexp (qr/(?{})/).
6275        We must check that code_blocksv is not already set, because we may
6276        have jumped back to restart the sizing pass. */
6277     if (pRExC_state->code_blocks && !code_blocksv) {
6278         code_blocksv = newSV_type(SVt_PV);
6279         SAVEFREESV(code_blocksv);
6280         SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6281         SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6282     }
6283     if (reg(pRExC_state, 0, &flags,1) == NULL) {
6284         /* It's possible to write a regexp in ascii that represents Unicode
6285         codepoints outside of the byte range, such as via \x{100}. If we
6286         detect such a sequence we have to convert the entire pattern to utf8
6287         and then recompile, as our sizing calculation will have been based
6288         on 1 byte == 1 character, but we will need to use utf8 to encode
6289         at least some part of the pattern, and therefore must convert the whole
6290         thing.
6291         -- dmq */
6292         if (flags & RESTART_UTF8) {
6293             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6294                                     pRExC_state->num_code_blocks);
6295             goto redo_first_pass;
6296         }
6297         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6298     }
6299     if (code_blocksv)
6300         SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6301
6302     DEBUG_PARSE_r({
6303         PerlIO_printf(Perl_debug_log,
6304             "Required size %"IVdf" nodes\n"
6305             "Starting second pass (creation)\n",
6306             (IV)RExC_size);
6307         RExC_lastnum=0;
6308         RExC_lastparse=NULL;
6309     });
6310
6311     /* The first pass could have found things that force Unicode semantics */
6312     if ((RExC_utf8 || RExC_uni_semantics)
6313          && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6314     {
6315         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6316     }
6317
6318     /* Small enough for pointer-storage convention?
6319        If extralen==0, this means that we will not need long jumps. */
6320     if (RExC_size >= 0x10000L && RExC_extralen)
6321         RExC_size += RExC_extralen;
6322     else
6323         RExC_extralen = 0;
6324     if (RExC_whilem_seen > 15)
6325         RExC_whilem_seen = 15;
6326
6327     /* Allocate space and zero-initialize. Note, the two step process
6328        of zeroing when in debug mode, thus anything assigned has to
6329        happen after that */
6330     rx = (REGEXP*) newSV_type(SVt_REGEXP);
6331     r = ReANY(rx);
6332     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6333          char, regexp_internal);
6334     if ( r == NULL || ri == NULL )
6335         FAIL("Regexp out of space");
6336 #ifdef DEBUGGING
6337     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6338     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
6339 #else
6340     /* bulk initialize base fields with 0. */
6341     Zero(ri, sizeof(regexp_internal), char);
6342 #endif
6343
6344     /* non-zero initialization begins here */
6345     RXi_SET( r, ri );
6346     r->engine= eng;
6347     r->extflags = rx_flags;
6348     RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6349
6350     if (pm_flags & PMf_IS_QR) {
6351         ri->code_blocks = pRExC_state->code_blocks;
6352         ri->num_code_blocks = pRExC_state->num_code_blocks;
6353     }
6354     else
6355     {
6356         int n;
6357         for (n = 0; n < pRExC_state->num_code_blocks; n++)
6358             if (pRExC_state->code_blocks[n].src_regex)
6359                 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6360         SAVEFREEPV(pRExC_state->code_blocks);
6361     }
6362
6363     {
6364         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6365         bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
6366
6367         /* The caret is output if there are any defaults: if not all the STD
6368          * flags are set, or if no character set specifier is needed */
6369         bool has_default =
6370                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6371                     || ! has_charset);
6372         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
6373         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6374                             >> RXf_PMf_STD_PMMOD_SHIFT);
6375         const char *fptr = STD_PAT_MODS;        /*"msix"*/
6376         char *p;
6377         /* Allocate for the worst case, which is all the std flags are turned
6378          * on.  If more precision is desired, we could do a population count of
6379          * the flags set.  This could be done with a small lookup table, or by
6380          * shifting, masking and adding, or even, when available, assembly
6381          * language for a machine-language population count.
6382          * We never output a minus, as all those are defaults, so are
6383          * covered by the caret */
6384         const STRLEN wraplen = plen + has_p + has_runon
6385             + has_default       /* If needs a caret */
6386
6387                 /* If needs a character set specifier */
6388             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6389             + (sizeof(STD_PAT_MODS) - 1)
6390             + (sizeof("(?:)") - 1);
6391
6392         Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6393         r->xpv_len_u.xpvlenu_pv = p;
6394         if (RExC_utf8)
6395             SvFLAGS(rx) |= SVf_UTF8;
6396         *p++='('; *p++='?';
6397
6398         /* If a default, cover it using the caret */
6399         if (has_default) {
6400             *p++= DEFAULT_PAT_MOD;
6401         }
6402         if (has_charset) {
6403             STRLEN len;
6404             const char* const name = get_regex_charset_name(r->extflags, &len);
6405             Copy(name, p, len, char);
6406             p += len;
6407         }
6408         if (has_p)
6409             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6410         {
6411             char ch;
6412             while((ch = *fptr++)) {
6413                 if(reganch & 1)
6414                     *p++ = ch;
6415                 reganch >>= 1;
6416             }
6417         }
6418
6419         *p++ = ':';
6420         Copy(RExC_precomp, p, plen, char);
6421         assert ((RX_WRAPPED(rx) - p) < 16);
6422         r->pre_prefix = p - RX_WRAPPED(rx);
6423         p += plen;
6424         if (has_runon)
6425             *p++ = '\n';
6426         *p++ = ')';
6427         *p = 0;
6428         SvCUR_set(rx, p - RX_WRAPPED(rx));
6429     }
6430
6431     r->intflags = 0;
6432     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6433
6434     /* setup various meta data about recursion, this all requires
6435      * RExC_npar to be correctly set, and a bit later on we clear it */
6436     if (RExC_seen & REG_SEEN_RECURSE) {
6437         Newxz(RExC_open_parens, RExC_npar,regnode *);
6438         SAVEFREEPV(RExC_open_parens);
6439         Newxz(RExC_close_parens,RExC_npar,regnode *);
6440         SAVEFREEPV(RExC_close_parens);
6441     }
6442     if (RExC_seen & (REG_SEEN_RECURSE | REG_SEEN_GOSTART)) {
6443         /* Note, RExC_npar is 1 + the number of parens in a pattern.
6444          * So its 1 if there are no parens. */
6445         RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6446                                          ((RExC_npar & 0x07) != 0);
6447         Newx(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6448         SAVEFREEPV(RExC_study_chunk_recursed);
6449     }
6450
6451     /* Useful during FAIL. */
6452 #ifdef RE_TRACK_PATTERN_OFFSETS
6453     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6454     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6455                           "%s %"UVuf" bytes for offset annotations.\n",
6456                           ri->u.offsets ? "Got" : "Couldn't get",
6457                           (UV)((2*RExC_size+1) * sizeof(U32))));
6458 #endif
6459     SetProgLen(ri,RExC_size);
6460     RExC_rx_sv = rx;
6461     RExC_rx = r;
6462     RExC_rxi = ri;
6463
6464     /* Second pass: emit code. */
6465     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
6466     RExC_pm_flags = pm_flags;
6467     RExC_parse = exp;
6468     RExC_end = exp + plen;
6469     RExC_naughty = 0;
6470     RExC_npar = 1;
6471     RExC_emit_start = ri->program;
6472     RExC_emit = ri->program;
6473     RExC_emit_bound = ri->program + RExC_size + 1;
6474     pRExC_state->code_index = 0;
6475
6476     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6477     if (reg(pRExC_state, 0, &flags,1) == NULL) {
6478         ReREFCNT_dec(rx);
6479         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6480     }
6481     /* XXXX To minimize changes to RE engine we always allocate
6482        3-units-long substrs field. */
6483     Newx(r->substrs, 1, struct reg_substr_data);
6484     if (RExC_recurse_count) {
6485         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6486         SAVEFREEPV(RExC_recurse);
6487     }
6488
6489 reStudy:
6490     r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6491     Zero(r->substrs, 1, struct reg_substr_data);
6492     if (RExC_study_chunk_recursed)
6493         Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6494
6495 #ifdef TRIE_STUDY_OPT
6496     if (!restudied) {
6497         StructCopy(&zero_scan_data, &data, scan_data_t);
6498         copyRExC_state = RExC_state;
6499     } else {
6500         U32 seen=RExC_seen;
6501         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6502
6503         RExC_state = copyRExC_state;
6504         if (seen & REG_TOP_LEVEL_BRANCHES)
6505             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6506         else
6507             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
6508         StructCopy(&zero_scan_data, &data, scan_data_t);
6509     }
6510 #else
6511     StructCopy(&zero_scan_data, &data, scan_data_t);
6512 #endif
6513
6514     /* Dig out information for optimizations. */
6515     r->extflags = RExC_flags; /* was pm_op */
6516     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6517
6518     if (UTF)
6519         SvUTF8_on(rx);  /* Unicode in it? */
6520     ri->regstclass = NULL;
6521     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
6522         r->intflags |= PREGf_NAUGHTY;
6523     scan = ri->program + 1;             /* First BRANCH. */
6524
6525     /* testing for BRANCH here tells us whether there is "must appear"
6526        data in the pattern. If there is then we can use it for optimisations */
6527     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
6528         SSize_t fake;
6529         STRLEN longest_float_length, longest_fixed_length;
6530         regnode_ssc ch_class; /* pointed to by data */
6531         int stclass_flag;
6532         SSize_t last_close = 0; /* pointed to by data */
6533         regnode *first= scan;
6534         regnode *first_next= regnext(first);
6535         /*
6536          * Skip introductions and multiplicators >= 1
6537          * so that we can extract the 'meat' of the pattern that must
6538          * match in the large if() sequence following.
6539          * NOTE that EXACT is NOT covered here, as it is normally
6540          * picked up by the optimiser separately.
6541          *
6542          * This is unfortunate as the optimiser isnt handling lookahead
6543          * properly currently.
6544          *
6545          */
6546         while ((OP(first) == OPEN && (sawopen = 1)) ||
6547                /* An OR of *one* alternative - should not happen now. */
6548             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6549             /* for now we can't handle lookbehind IFMATCH*/
6550             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6551             (OP(first) == PLUS) ||
6552             (OP(first) == MINMOD) ||
6553                /* An {n,m} with n>0 */
6554             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6555             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6556         {
6557                 /*
6558                  * the only op that could be a regnode is PLUS, all the rest
6559                  * will be regnode_1 or regnode_2.
6560                  *
6561                  * (yves doesn't think this is true)
6562                  */
6563                 if (OP(first) == PLUS)
6564                     sawplus = 1;
6565                 else {
6566                     if (OP(first) == MINMOD)
6567                         sawminmod = 1;
6568                     first += regarglen[OP(first)];
6569                 }
6570                 first = NEXTOPER(first);
6571                 first_next= regnext(first);
6572         }
6573
6574         /* Starting-point info. */
6575       again:
6576         DEBUG_PEEP("first:",first,0);
6577         /* Ignore EXACT as we deal with it later. */
6578         if (PL_regkind[OP(first)] == EXACT) {
6579             if (OP(first) == EXACT)
6580                 NOOP;   /* Empty, get anchored substr later. */
6581             else
6582                 ri->regstclass = first;
6583         }
6584 #ifdef TRIE_STCLASS
6585         else if (PL_regkind[OP(first)] == TRIE &&
6586                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6587         {
6588             regnode *trie_op;
6589             /* this can happen only on restudy */
6590             if ( OP(first) == TRIE ) {
6591                 struct regnode_1 *trieop = (struct regnode_1 *)
6592                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
6593                 StructCopy(first,trieop,struct regnode_1);
6594                 trie_op=(regnode *)trieop;
6595             } else {
6596                 struct regnode_charclass *trieop = (struct regnode_charclass *)
6597                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6598                 StructCopy(first,trieop,struct regnode_charclass);
6599                 trie_op=(regnode *)trieop;
6600             }
6601             OP(trie_op)+=2;
6602             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6603             ri->regstclass = trie_op;
6604         }
6605 #endif
6606         else if (REGNODE_SIMPLE(OP(first)))
6607             ri->regstclass = first;
6608         else if (PL_regkind[OP(first)] == BOUND ||
6609                  PL_regkind[OP(first)] == NBOUND)
6610             ri->regstclass = first;
6611         else if (PL_regkind[OP(first)] == BOL) {
6612             r->extflags |= (OP(first) == MBOL
6613                            ? RXf_ANCH_MBOL
6614                            : (OP(first) == SBOL
6615                               ? RXf_ANCH_SBOL
6616                               : RXf_ANCH_BOL));
6617             first = NEXTOPER(first);
6618             goto again;
6619         }
6620         else if (OP(first) == GPOS) {
6621             r->extflags |= RXf_ANCH_GPOS;
6622             first = NEXTOPER(first);
6623             goto again;
6624         }
6625         else if ((!sawopen || !RExC_sawback) &&
6626             (OP(first) == STAR &&
6627             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6628             !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6629         {
6630             /* turn .* into ^.* with an implied $*=1 */
6631             const int type =
6632                 (OP(NEXTOPER(first)) == REG_ANY)
6633                     ? RXf_ANCH_MBOL
6634                     : RXf_ANCH_SBOL;
6635             r->extflags |= type;
6636             r->intflags |= PREGf_IMPLICIT;
6637             first = NEXTOPER(first);
6638             goto again;
6639         }
6640         if (sawplus && !sawminmod && !sawlookahead && (!sawopen || !RExC_sawback)
6641             && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6642             /* x+ must match at the 1st pos of run of x's */
6643             r->intflags |= PREGf_SKIP;
6644
6645         /* Scan is after the zeroth branch, first is atomic matcher. */
6646 #ifdef TRIE_STUDY_OPT
6647         DEBUG_PARSE_r(
6648             if (!restudied)
6649                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6650                               (IV)(first - scan + 1))
6651         );
6652 #else
6653         DEBUG_PARSE_r(
6654             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6655                 (IV)(first - scan + 1))
6656         );
6657 #endif
6658
6659
6660         /*
6661         * If there's something expensive in the r.e., find the
6662         * longest literal string that must appear and make it the
6663         * regmust.  Resolve ties in favor of later strings, since
6664         * the regstart check works with the beginning of the r.e.
6665         * and avoiding duplication strengthens checking.  Not a
6666         * strong reason, but sufficient in the absence of others.
6667         * [Now we resolve ties in favor of the earlier string if
6668         * it happens that c_offset_min has been invalidated, since the
6669         * earlier string may buy us something the later one won't.]
6670         */
6671
6672         data.longest_fixed = newSVpvs("");
6673         data.longest_float = newSVpvs("");
6674         data.last_found = newSVpvs("");
6675         data.longest = &(data.longest_fixed);
6676         ENTER_with_name("study_chunk");
6677         SAVEFREESV(data.longest_fixed);
6678         SAVEFREESV(data.longest_float);
6679         SAVEFREESV(data.last_found);
6680         first = scan;
6681         if (!ri->regstclass) {
6682             ssc_init(pRExC_state, &ch_class);
6683             data.start_class = &ch_class;
6684             stclass_flag = SCF_DO_STCLASS_AND;
6685         } else                          /* XXXX Check for BOUND? */
6686             stclass_flag = 0;
6687         data.last_closep = &last_close;
6688
6689         DEBUG_RExC_seen();
6690         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6691             &data, -1, 0, NULL,
6692             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
6693                           | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6694             0);
6695
6696
6697         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6698
6699
6700         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6701              && data.last_start_min == 0 && data.last_end > 0
6702              && !RExC_seen_zerolen
6703              && !(RExC_seen & REG_SEEN_VERBARG)
6704              && !((RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6705             r->extflags |= RXf_CHECK_ALL;
6706         scan_commit(pRExC_state, &data,&minlen,0);
6707
6708         longest_float_length = CHR_SVLEN(data.longest_float);
6709
6710         if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
6711                    && data.offset_fixed == data.offset_float_min
6712                    && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6713             && S_setup_longest (aTHX_ pRExC_state,
6714                                     data.longest_float,
6715                                     &(r->float_utf8),
6716                                     &(r->float_substr),
6717                                     &(r->float_end_shift),
6718                                     data.lookbehind_float,
6719                                     data.offset_float_min,
6720                                     data.minlen_float,
6721                                     longest_float_length,
6722                                     cBOOL(data.flags & SF_FL_BEFORE_EOL),
6723                                     cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6724         {
6725             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6726             r->float_max_offset = data.offset_float_max;
6727             if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
6728                 r->float_max_offset -= data.lookbehind_float;
6729             SvREFCNT_inc_simple_void_NN(data.longest_float);
6730         }
6731         else {
6732             r->float_substr = r->float_utf8 = NULL;
6733             longest_float_length = 0;
6734         }
6735
6736         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6737
6738         if (S_setup_longest (aTHX_ pRExC_state,
6739                                 data.longest_fixed,
6740                                 &(r->anchored_utf8),
6741                                 &(r->anchored_substr),
6742                                 &(r->anchored_end_shift),
6743                                 data.lookbehind_fixed,
6744                                 data.offset_fixed,
6745                                 data.minlen_fixed,
6746                                 longest_fixed_length,
6747                                 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6748                                 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6749         {
6750             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6751             SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6752         }
6753         else {
6754             r->anchored_substr = r->anchored_utf8 = NULL;
6755             longest_fixed_length = 0;
6756         }
6757         LEAVE_with_name("study_chunk");
6758
6759         if (ri->regstclass
6760             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6761             ri->regstclass = NULL;
6762
6763         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6764             && stclass_flag
6765             && ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
6766             && !ssc_is_anything(data.start_class))
6767         {
6768             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
6769
6770             ssc_finalize(pRExC_state, data.start_class);
6771
6772             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
6773             StructCopy(data.start_class,
6774                        (regnode_ssc*)RExC_rxi->data->data[n],
6775                        regnode_ssc);
6776             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6777             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6778             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6779                       regprop(r, sv, (regnode*)data.start_class);
6780                       PerlIO_printf(Perl_debug_log,
6781                                     "synthetic stclass \"%s\".\n",
6782                                     SvPVX_const(sv));});
6783             data.start_class = NULL;
6784         }
6785
6786         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6787         if (longest_fixed_length > longest_float_length) {
6788             r->check_end_shift = r->anchored_end_shift;
6789             r->check_substr = r->anchored_substr;
6790             r->check_utf8 = r->anchored_utf8;
6791             r->check_offset_min = r->check_offset_max = r->anchored_offset;
6792             if (r->extflags & RXf_ANCH_SINGLE)
6793                 r->extflags |= RXf_NOSCAN;
6794         }
6795         else {
6796             r->check_end_shift = r->float_end_shift;
6797             r->check_substr = r->float_substr;
6798             r->check_utf8 = r->float_utf8;
6799             r->check_offset_min = r->float_min_offset;
6800             r->check_offset_max = r->float_max_offset;
6801         }
6802         if ((r->check_substr || r->check_utf8) ) {
6803             r->extflags |= RXf_USE_INTUIT;
6804             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6805                 r->extflags |= RXf_INTUIT_TAIL;
6806         }
6807         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6808         if ( (STRLEN)minlen < longest_float_length )
6809             minlen= longest_float_length;
6810         if ( (STRLEN)minlen < longest_fixed_length )
6811             minlen= longest_fixed_length;
6812         */
6813     }
6814     else {
6815         /* Several toplevels. Best we can is to set minlen. */
6816         SSize_t fake;
6817         regnode_ssc ch_class;
6818         SSize_t last_close = 0;
6819
6820         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6821
6822         scan = ri->program + 1;
6823         ssc_init(pRExC_state, &ch_class);
6824         data.start_class = &ch_class;
6825         data.last_closep = &last_close;
6826
6827         DEBUG_RExC_seen();
6828         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6829             &data, -1, 0, NULL,
6830             SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS
6831                               |(restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6832             0);
6833
6834         CHECK_RESTUDY_GOTO_butfirst(NOOP);
6835
6836         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6837                 = r->float_substr = r->float_utf8 = NULL;
6838
6839         if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
6840             && ! ssc_is_anything(data.start_class))
6841         {
6842             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
6843
6844             ssc_finalize(pRExC_state, data.start_class);
6845
6846             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
6847             StructCopy(data.start_class,
6848                        (regnode_ssc*)RExC_rxi->data->data[n],
6849                        regnode_ssc);
6850             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6851             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6852             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6853                       regprop(r, sv, (regnode*)data.start_class);
6854                       PerlIO_printf(Perl_debug_log,
6855                                     "synthetic stclass \"%s\".\n",
6856                                     SvPVX_const(sv));});
6857             data.start_class = NULL;
6858         }
6859     }
6860
6861     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6862        the "real" pattern. */
6863     DEBUG_OPTIMISE_r({
6864         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6865                       (IV)minlen, (IV)r->minlen);
6866     });
6867     r->minlenret = minlen;
6868     if (r->minlen < minlen)
6869         r->minlen = minlen;
6870
6871     if (RExC_seen & REG_SEEN_GPOS)
6872         r->extflags |= RXf_GPOS_SEEN;
6873     if (RExC_seen & REG_SEEN_LOOKBEHIND)
6874         r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the lookbehind */
6875     if (pRExC_state->num_code_blocks)
6876         r->extflags |= RXf_EVAL_SEEN;
6877     if (RExC_seen & REG_SEEN_CANY)
6878         r->extflags |= RXf_CANY_SEEN;
6879     if (RExC_seen & REG_SEEN_VERBARG)
6880     {
6881         r->intflags |= PREGf_VERBARG_SEEN;
6882         r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
6883     }
6884     if (RExC_seen & REG_SEEN_CUTGROUP)
6885         r->intflags |= PREGf_CUTGROUP_SEEN;
6886     if (pm_flags & PMf_USE_RE_EVAL)
6887         r->intflags |= PREGf_USE_RE_EVAL;
6888     if (RExC_paren_names)
6889         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6890     else
6891         RXp_PAREN_NAMES(r) = NULL;
6892
6893     {
6894         regnode *first = ri->program + 1;
6895         U8 fop = OP(first);
6896         regnode *next = NEXTOPER(first);
6897         U8 nop = OP(next);
6898
6899         if (PL_regkind[fop] == NOTHING && nop == END)
6900             r->extflags |= RXf_NULL;
6901         else if (PL_regkind[fop] == BOL && nop == END)
6902             r->extflags |= RXf_START_ONLY;
6903         else if (fop == PLUS && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE && OP(regnext(first)) == END)
6904             r->extflags |= RXf_WHITE;
6905         else if ( r->extflags & RXf_SPLIT && fop == EXACT && STR_LEN(first) == 1 && *(STRING(first)) == ' ' && OP(regnext(first)) == END )
6906             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6907
6908     }
6909 #ifdef DEBUGGING
6910     if (RExC_paren_names) {
6911         ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
6912         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6913     } else
6914 #endif
6915         ri->name_list_idx = 0;
6916
6917     if (RExC_recurse_count) {
6918         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6919             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6920             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6921         }
6922     }
6923     Newxz(r->offs, RExC_npar, regexp_paren_pair);
6924     /* assume we don't need to swap parens around before we match */
6925
6926     DEBUG_DUMP_r({
6927         DEBUG_RExC_seen();
6928         PerlIO_printf(Perl_debug_log,"Final program:\n");
6929         regdump(r);
6930     });
6931 #ifdef RE_TRACK_PATTERN_OFFSETS
6932     DEBUG_OFFSETS_r(if (ri->u.offsets) {
6933         const STRLEN len = ri->u.offsets[0];
6934         STRLEN i;
6935         GET_RE_DEBUG_FLAGS_DECL;
6936         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6937         for (i = 1; i <= len; i++) {
6938             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6939                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6940                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6941             }
6942         PerlIO_printf(Perl_debug_log, "\n");
6943     });
6944 #endif
6945
6946 #ifdef USE_ITHREADS
6947     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
6948      * by setting the regexp SV to readonly-only instead. If the
6949      * pattern's been recompiled, the USEDness should remain. */
6950     if (old_re && SvREADONLY(old_re))
6951         SvREADONLY_on(rx);
6952 #endif
6953     return rx;
6954 }
6955
6956
6957 SV*
6958 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6959                     const U32 flags)
6960 {
6961     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6962
6963     PERL_UNUSED_ARG(value);
6964
6965     if (flags & RXapif_FETCH) {
6966         return reg_named_buff_fetch(rx, key, flags);
6967     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6968         Perl_croak_no_modify();
6969         return NULL;
6970     } else if (flags & RXapif_EXISTS) {
6971         return reg_named_buff_exists(rx, key, flags)
6972             ? &PL_sv_yes
6973             : &PL_sv_no;
6974     } else if (flags & RXapif_REGNAMES) {
6975         return reg_named_buff_all(rx, flags);
6976     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6977         return reg_named_buff_scalar(rx, flags);
6978     } else {
6979         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6980         return NULL;
6981     }
6982 }
6983
6984 SV*
6985 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6986                          const U32 flags)
6987 {
6988     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6989     PERL_UNUSED_ARG(lastkey);
6990
6991     if (flags & RXapif_FIRSTKEY)
6992         return reg_named_buff_firstkey(rx, flags);
6993     else if (flags & RXapif_NEXTKEY)
6994         return reg_named_buff_nextkey(rx, flags);
6995     else {
6996         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6997         return NULL;
6998     }
6999 }
7000
7001 SV*
7002 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7003                           const U32 flags)
7004 {
7005     AV *retarray = NULL;
7006     SV *ret;
7007     struct regexp *const rx = ReANY(r);
7008
7009     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7010
7011     if (flags & RXapif_ALL)
7012         retarray=newAV();
7013
7014     if (rx && RXp_PAREN_NAMES(rx)) {
7015         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7016         if (he_str) {
7017             IV i;
7018             SV* sv_dat=HeVAL(he_str);
7019             I32 *nums=(I32*)SvPVX(sv_dat);
7020             for ( i=0; i<SvIVX(sv_dat); i++ ) {
7021                 if ((I32)(rx->nparens) >= nums[i]
7022                     && rx->offs[nums[i]].start != -1
7023                     && rx->offs[nums[i]].end != -1)
7024                 {
7025                     ret = newSVpvs("");
7026                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7027                     if (!retarray)
7028                         return ret;
7029                 } else {
7030                     if (retarray)
7031                         ret = newSVsv(&PL_sv_undef);
7032                 }
7033                 if (retarray)
7034                     av_push(retarray, ret);
7035             }
7036             if (retarray)
7037                 return newRV_noinc(MUTABLE_SV(retarray));
7038         }
7039     }
7040     return NULL;
7041 }
7042
7043 bool
7044 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7045                            const U32 flags)
7046 {
7047     struct regexp *const rx = ReANY(r);
7048
7049     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7050
7051     if (rx && RXp_PAREN_NAMES(rx)) {
7052         if (flags & RXapif_ALL) {
7053             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7054         } else {
7055             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7056             if (sv) {
7057                 SvREFCNT_dec_NN(sv);
7058                 return TRUE;
7059             } else {
7060                 return FALSE;
7061             }
7062         }
7063     } else {
7064         return FALSE;
7065     }
7066 }
7067
7068 SV*
7069 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7070 {
7071     struct regexp *const rx = ReANY(r);
7072
7073     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7074
7075     if ( rx && RXp_PAREN_NAMES(rx) ) {
7076         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7077
7078         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7079     } else {
7080         return FALSE;
7081     }
7082 }
7083
7084 SV*
7085 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7086 {
7087     struct regexp *const rx = ReANY(r);
7088     GET_RE_DEBUG_FLAGS_DECL;
7089
7090     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7091
7092     if (rx && RXp_PAREN_NAMES(rx)) {
7093         HV *hv = RXp_PAREN_NAMES(rx);
7094         HE *temphe;
7095         while ( (temphe = hv_iternext_flags(hv,0)) ) {
7096             IV i;
7097             IV parno = 0;
7098             SV* sv_dat = HeVAL(temphe);
7099             I32 *nums = (I32*)SvPVX(sv_dat);
7100             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7101                 if ((I32)(rx->lastparen) >= nums[i] &&
7102                     rx->offs[nums[i]].start != -1 &&
7103                     rx->offs[nums[i]].end != -1)
7104                 {
7105                     parno = nums[i];
7106                     break;
7107                 }
7108             }
7109             if (parno || flags & RXapif_ALL) {
7110                 return newSVhek(HeKEY_hek(temphe));
7111             }
7112         }
7113     }
7114     return NULL;
7115 }
7116
7117 SV*
7118 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7119 {
7120     SV *ret;
7121     AV *av;
7122     SSize_t length;
7123     struct regexp *const rx = ReANY(r);
7124
7125     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7126
7127     if (rx && RXp_PAREN_NAMES(rx)) {
7128         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7129             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7130         } else if (flags & RXapif_ONE) {
7131             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7132             av = MUTABLE_AV(SvRV(ret));
7133             length = av_len(av);
7134             SvREFCNT_dec_NN(ret);
7135             return newSViv(length + 1);
7136         } else {
7137             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
7138             return NULL;
7139         }
7140     }
7141     return &PL_sv_undef;
7142 }
7143
7144 SV*
7145 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7146 {
7147     struct regexp *const rx = ReANY(r);
7148     AV *av = newAV();
7149
7150     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7151
7152     if (rx && RXp_PAREN_NAMES(rx)) {
7153         HV *hv= RXp_PAREN_NAMES(rx);
7154         HE *temphe;
7155         (void)hv_iterinit(hv);
7156         while ( (temphe = hv_iternext_flags(hv,0)) ) {
7157             IV i;
7158             IV parno = 0;
7159             SV* sv_dat = HeVAL(temphe);
7160             I32 *nums = (I32*)SvPVX(sv_dat);
7161             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7162                 if ((I32)(rx->lastparen) >= nums[i] &&
7163                     rx->offs[nums[i]].start != -1 &&
7164                     rx->offs[nums[i]].end != -1)
7165                 {
7166                     parno = nums[i];
7167                     break;
7168                 }
7169             }
7170             if (parno || flags & RXapif_ALL) {
7171                 av_push(av, newSVhek(HeKEY_hek(temphe)));
7172             }
7173         }
7174     }
7175
7176     return newRV_noinc(MUTABLE_SV(av));
7177 }
7178
7179 void
7180 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7181                              SV * const sv)
7182 {
7183     struct regexp *const rx = ReANY(r);
7184     char *s = NULL;
7185     SSize_t i = 0;
7186     SSize_t s1, t1;
7187     I32 n = paren;
7188
7189     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7190
7191     if (      n == RX_BUFF_IDX_CARET_PREMATCH
7192            || n == RX_BUFF_IDX_CARET_FULLMATCH
7193            || n == RX_BUFF_IDX_CARET_POSTMATCH
7194        )
7195     {
7196         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7197         if (!keepcopy) {
7198             /* on something like
7199              *    $r = qr/.../;
7200              *    /$qr/p;
7201              * the KEEPCOPY is set on the PMOP rather than the regex */
7202             if (PL_curpm && r == PM_GETRE(PL_curpm))
7203                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7204         }
7205         if (!keepcopy)
7206             goto ret_undef;
7207     }
7208
7209     if (!rx->subbeg)
7210         goto ret_undef;
7211
7212     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7213         /* no need to distinguish between them any more */
7214         n = RX_BUFF_IDX_FULLMATCH;
7215
7216     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7217         && rx->offs[0].start != -1)
7218     {
7219         /* $`, ${^PREMATCH} */
7220         i = rx->offs[0].start;
7221         s = rx->subbeg;
7222     }
7223     else
7224     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7225         && rx->offs[0].end != -1)
7226     {
7227         /* $', ${^POSTMATCH} */
7228         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7229         i = rx->sublen + rx->suboffset - rx->offs[0].end;
7230     }
7231     else
7232     if ( 0 <= n && n <= (I32)rx->nparens &&
7233         (s1 = rx->offs[n].start) != -1 &&
7234         (t1 = rx->offs[n].end) != -1)
7235     {
7236         /* $&, ${^MATCH},  $1 ... */
7237         i = t1 - s1;
7238         s = rx->subbeg + s1 - rx->suboffset;
7239     } else {
7240         goto ret_undef;
7241     }
7242
7243     assert(s >= rx->subbeg);
7244     assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7245     if (i >= 0) {
7246 #if NO_TAINT_SUPPORT
7247         sv_setpvn(sv, s, i);
7248 #else
7249         const int oldtainted = TAINT_get;
7250         TAINT_NOT;
7251         sv_setpvn(sv, s, i);
7252         TAINT_set(oldtainted);
7253 #endif
7254         if ( (rx->extflags & RXf_CANY_SEEN)
7255             ? (RXp_MATCH_UTF8(rx)
7256                         && (!i || is_utf8_string((U8*)s, i)))
7257             : (RXp_MATCH_UTF8(rx)) )
7258         {
7259             SvUTF8_on(sv);
7260         }
7261         else
7262             SvUTF8_off(sv);
7263         if (TAINTING_get) {
7264             if (RXp_MATCH_TAINTED(rx)) {
7265                 if (SvTYPE(sv) >= SVt_PVMG) {
7266                     MAGIC* const mg = SvMAGIC(sv);
7267                     MAGIC* mgt;
7268                     TAINT;
7269                     SvMAGIC_set(sv, mg->mg_moremagic);
7270                     SvTAINT(sv);
7271                     if ((mgt = SvMAGIC(sv))) {
7272                         mg->mg_moremagic = mgt;
7273                         SvMAGIC_set(sv, mg);
7274                     }
7275                 } else {
7276                     TAINT;
7277                     SvTAINT(sv);
7278                 }
7279             } else
7280                 SvTAINTED_off(sv);
7281         }
7282     } else {
7283       ret_undef:
7284         sv_setsv(sv,&PL_sv_undef);
7285         return;
7286     }
7287 }
7288
7289 void
7290 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7291                                                          SV const * const value)
7292 {
7293     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7294
7295     PERL_UNUSED_ARG(rx);
7296     PERL_UNUSED_ARG(paren);
7297     PERL_UNUSED_ARG(value);
7298
7299     if (!PL_localizing)
7300         Perl_croak_no_modify();
7301 }
7302
7303 I32
7304 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7305                               const I32 paren)
7306 {
7307     struct regexp *const rx = ReANY(r);
7308     I32 i;
7309     I32 s1, t1;
7310
7311     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7312
7313     if (   paren == RX_BUFF_IDX_CARET_PREMATCH
7314         || paren == RX_BUFF_IDX_CARET_FULLMATCH
7315         || paren == RX_BUFF_IDX_CARET_POSTMATCH
7316     )
7317     {
7318         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7319         if (!keepcopy) {
7320             /* on something like
7321              *    $r = qr/.../;
7322              *    /$qr/p;
7323              * the KEEPCOPY is set on the PMOP rather than the regex */
7324             if (PL_curpm && r == PM_GETRE(PL_curpm))
7325                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7326         }
7327         if (!keepcopy)
7328             goto warn_undef;
7329     }
7330
7331     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7332     switch (paren) {
7333       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7334       case RX_BUFF_IDX_PREMATCH:       /* $` */
7335         if (rx->offs[0].start != -1) {
7336                         i = rx->offs[0].start;
7337                         if (i > 0) {
7338                                 s1 = 0;
7339                                 t1 = i;
7340                                 goto getlen;
7341                         }
7342             }
7343         return 0;
7344
7345       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7346       case RX_BUFF_IDX_POSTMATCH:       /* $' */
7347             if (rx->offs[0].end != -1) {
7348                         i = rx->sublen - rx->offs[0].end;
7349                         if (i > 0) {
7350                                 s1 = rx->offs[0].end;
7351                                 t1 = rx->sublen;
7352                                 goto getlen;
7353                         }
7354             }
7355         return 0;
7356
7357       default: /* $& / ${^MATCH}, $1, $2, ... */
7358             if (paren <= (I32)rx->nparens &&
7359             (s1 = rx->offs[paren].start) != -1 &&
7360             (t1 = rx->offs[paren].end) != -1)
7361             {
7362             i = t1 - s1;
7363             goto getlen;
7364         } else {
7365           warn_undef:
7366             if (ckWARN(WARN_UNINITIALIZED))
7367                 report_uninit((const SV *)sv);
7368             return 0;
7369         }
7370     }
7371   getlen:
7372     if (i > 0 && RXp_MATCH_UTF8(rx)) {
7373         const char * const s = rx->subbeg - rx->suboffset + s1;
7374         const U8 *ep;
7375         STRLEN el;
7376
7377         i = t1 - s1;
7378         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7379                         i = el;
7380     }
7381     return i;
7382 }
7383
7384 SV*
7385 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7386 {
7387     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7388         PERL_UNUSED_ARG(rx);
7389         if (0)
7390             return NULL;
7391         else
7392             return newSVpvs("Regexp");
7393 }
7394
7395 /* Scans the name of a named buffer from the pattern.
7396  * If flags is REG_RSN_RETURN_NULL returns null.
7397  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7398  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7399  * to the parsed name as looked up in the RExC_paren_names hash.
7400  * If there is an error throws a vFAIL().. type exception.
7401  */
7402
7403 #define REG_RSN_RETURN_NULL    0
7404 #define REG_RSN_RETURN_NAME    1
7405 #define REG_RSN_RETURN_DATA    2
7406
7407 STATIC SV*
7408 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7409 {
7410     char *name_start = RExC_parse;
7411
7412     PERL_ARGS_ASSERT_REG_SCAN_NAME;
7413
7414     assert (RExC_parse <= RExC_end);
7415     if (RExC_parse == RExC_end) NOOP;
7416     else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7417          /* skip IDFIRST by using do...while */
7418         if (UTF)
7419             do {
7420                 RExC_parse += UTF8SKIP(RExC_parse);
7421             } while (isWORDCHAR_utf8((U8*)RExC_parse));
7422         else
7423             do {
7424                 RExC_parse++;
7425             } while (isWORDCHAR(*RExC_parse));
7426     } else {
7427         RExC_parse++; /* so the <- from the vFAIL is after the offending character */
7428         vFAIL("Group name must start with a non-digit word character");
7429     }
7430     if ( flags ) {
7431         SV* sv_name
7432             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7433                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7434         if ( flags == REG_RSN_RETURN_NAME)
7435             return sv_name;
7436         else if (flags==REG_RSN_RETURN_DATA) {
7437             HE *he_str = NULL;
7438             SV *sv_dat = NULL;
7439             if ( ! sv_name )      /* should not happen*/
7440                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7441             if (RExC_paren_names)
7442                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7443             if ( he_str )
7444                 sv_dat = HeVAL(he_str);
7445             if ( ! sv_dat )
7446                 vFAIL("Reference to nonexistent named group");
7447             return sv_dat;
7448         }
7449         else {
7450             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7451                        (unsigned long) flags);
7452         }
7453         assert(0); /* NOT REACHED */
7454     }
7455     return NULL;
7456 }
7457
7458 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
7459     int rem=(int)(RExC_end - RExC_parse);                       \
7460     int cut;                                                    \
7461     int num;                                                    \
7462     int iscut=0;                                                \
7463     if (rem>10) {                                               \
7464         rem=10;                                                 \
7465         iscut=1;                                                \
7466     }                                                           \
7467     cut=10-rem;                                                 \
7468     if (RExC_lastparse!=RExC_parse)                             \
7469         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
7470             rem, RExC_parse,                                    \
7471             cut + 4,                                            \
7472             iscut ? "..." : "<"                                 \
7473         );                                                      \
7474     else                                                        \
7475         PerlIO_printf(Perl_debug_log,"%16s","");                \
7476                                                                 \
7477     if (SIZE_ONLY)                                              \
7478        num = RExC_size + 1;                                     \
7479     else                                                        \
7480        num=REG_NODE_NUM(RExC_emit);                             \
7481     if (RExC_lastnum!=num)                                      \
7482        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
7483     else                                                        \
7484        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
7485     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
7486         (int)((depth*2)), "",                                   \
7487         (funcname)                                              \
7488     );                                                          \
7489     RExC_lastnum=num;                                           \
7490     RExC_lastparse=RExC_parse;                                  \
7491 })
7492
7493
7494
7495 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
7496     DEBUG_PARSE_MSG((funcname));                            \
7497     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
7498 })
7499 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
7500     DEBUG_PARSE_MSG((funcname));                            \
7501     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
7502 })
7503
7504 /* This section of code defines the inversion list object and its methods.  The
7505  * interfaces are highly subject to change, so as much as possible is static to
7506  * this file.  An inversion list is here implemented as a malloc'd C UV array
7507  * as an SVt_INVLIST scalar.
7508  *
7509  * An inversion list for Unicode is an array of code points, sorted by ordinal
7510  * number.  The zeroth element is the first code point in the list.  The 1th
7511  * element is the first element beyond that not in the list.  In other words,
7512  * the first range is
7513  *  invlist[0]..(invlist[1]-1)
7514  * The other ranges follow.  Thus every element whose index is divisible by two
7515  * marks the beginning of a range that is in the list, and every element not
7516  * divisible by two marks the beginning of a range not in the list.  A single
7517  * element inversion list that contains the single code point N generally
7518  * consists of two elements
7519  *  invlist[0] == N
7520  *  invlist[1] == N+1
7521  * (The exception is when N is the highest representable value on the
7522  * machine, in which case the list containing just it would be a single
7523  * element, itself.  By extension, if the last range in the list extends to
7524  * infinity, then the first element of that range will be in the inversion list
7525  * at a position that is divisible by two, and is the final element in the
7526  * list.)
7527  * Taking the complement (inverting) an inversion list is quite simple, if the
7528  * first element is 0, remove it; otherwise add a 0 element at the beginning.
7529  * This implementation reserves an element at the beginning of each inversion
7530  * list to always contain 0; there is an additional flag in the header which
7531  * indicates if the list begins at the 0, or is offset to begin at the next
7532  * element.
7533  *
7534  * More about inversion lists can be found in "Unicode Demystified"
7535  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7536  * More will be coming when functionality is added later.
7537  *
7538  * The inversion list data structure is currently implemented as an SV pointing
7539  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
7540  * array of UV whose memory management is automatically handled by the existing
7541  * facilities for SV's.
7542  *
7543  * Some of the methods should always be private to the implementation, and some
7544  * should eventually be made public */
7545
7546 /* The header definitions are in F<inline_invlist.c> */
7547
7548 PERL_STATIC_INLINE UV*
7549 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7550 {
7551     /* Returns a pointer to the first element in the inversion list's array.
7552      * This is called upon initialization of an inversion list.  Where the
7553      * array begins depends on whether the list has the code point U+0000 in it
7554      * or not.  The other parameter tells it whether the code that follows this
7555      * call is about to put a 0 in the inversion list or not.  The first
7556      * element is either the element reserved for 0, if TRUE, or the element
7557      * after it, if FALSE */
7558
7559     bool* offset = get_invlist_offset_addr(invlist);
7560     UV* zero_addr = (UV *) SvPVX(invlist);
7561
7562     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7563
7564     /* Must be empty */
7565     assert(! _invlist_len(invlist));
7566
7567     *zero_addr = 0;
7568
7569     /* 1^1 = 0; 1^0 = 1 */
7570     *offset = 1 ^ will_have_0;
7571     return zero_addr + *offset;
7572 }
7573
7574 PERL_STATIC_INLINE UV*
7575 S_invlist_array(pTHX_ SV* const invlist)
7576 {
7577     /* Returns the pointer to the inversion list's array.  Every time the
7578      * length changes, this needs to be called in case malloc or realloc moved
7579      * it */
7580
7581     PERL_ARGS_ASSERT_INVLIST_ARRAY;
7582
7583     /* Must not be empty.  If these fail, you probably didn't check for <len>
7584      * being non-zero before trying to get the array */
7585     assert(_invlist_len(invlist));
7586
7587     /* The very first element always contains zero, The array begins either
7588      * there, or if the inversion list is offset, at the element after it.
7589      * The offset header field determines which; it contains 0 or 1 to indicate
7590      * how much additionally to add */
7591     assert(0 == *(SvPVX(invlist)));
7592     return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
7593 }
7594
7595 PERL_STATIC_INLINE void
7596 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
7597 {
7598     /* Sets the current number of elements stored in the inversion list.
7599      * Updates SvCUR correspondingly */
7600
7601     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7602
7603     assert(SvTYPE(invlist) == SVt_INVLIST);
7604
7605     SvCUR_set(invlist,
7606               (len == 0)
7607                ? 0
7608                : TO_INTERNAL_SIZE(len + offset));
7609     assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
7610 }
7611
7612 PERL_STATIC_INLINE IV*
7613 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7614 {
7615     /* Return the address of the IV that is reserved to hold the cached index
7616      * */
7617
7618     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7619
7620     assert(SvTYPE(invlist) == SVt_INVLIST);
7621
7622     return &(((XINVLIST*) SvANY(invlist))->prev_index);
7623 }
7624
7625 PERL_STATIC_INLINE IV
7626 S_invlist_previous_index(pTHX_ SV* const invlist)
7627 {
7628     /* Returns cached index of previous search */
7629
7630     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7631
7632     return *get_invlist_previous_index_addr(invlist);
7633 }
7634
7635 PERL_STATIC_INLINE void
7636 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7637 {
7638     /* Caches <index> for later retrieval */
7639
7640     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7641
7642     assert(index == 0 || index < (int) _invlist_len(invlist));
7643
7644     *get_invlist_previous_index_addr(invlist) = index;
7645 }
7646
7647 PERL_STATIC_INLINE UV
7648 S_invlist_max(pTHX_ SV* const invlist)
7649 {
7650     /* Returns the maximum number of elements storable in the inversion list's
7651      * array, without having to realloc() */
7652
7653     PERL_ARGS_ASSERT_INVLIST_MAX;
7654
7655     assert(SvTYPE(invlist) == SVt_INVLIST);
7656
7657     /* Assumes worst case, in which the 0 element is not counted in the
7658      * inversion list, so subtracts 1 for that */
7659     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
7660            ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
7661            : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
7662 }
7663
7664 #ifndef PERL_IN_XSUB_RE
7665 SV*
7666 Perl__new_invlist(pTHX_ IV initial_size)
7667 {
7668
7669     /* Return a pointer to a newly constructed inversion list, with enough
7670      * space to store 'initial_size' elements.  If that number is negative, a
7671      * system default is used instead */
7672
7673     SV* new_list;
7674
7675     if (initial_size < 0) {
7676         initial_size = 10;
7677     }
7678
7679     /* Allocate the initial space */
7680     new_list = newSV_type(SVt_INVLIST);
7681
7682     /* First 1 is in case the zero element isn't in the list; second 1 is for
7683      * trailing NUL */
7684     SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
7685     invlist_set_len(new_list, 0, 0);
7686
7687     /* Force iterinit() to be used to get iteration to work */
7688     *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
7689
7690     *get_invlist_previous_index_addr(new_list) = 0;
7691
7692     return new_list;
7693 }
7694 #endif
7695
7696 STATIC SV*
7697 S__new_invlist_C_array(pTHX_ const UV* const list)
7698 {
7699     /* Return a pointer to a newly constructed inversion list, initialized to
7700      * point to <list>, which has to be in the exact correct inversion list
7701      * form, including internal fields.  Thus this is a dangerous routine that
7702      * should not be used in the wrong hands.  The passed in 'list' contains
7703      * several header fields at the beginning that are not part of the
7704      * inversion list body proper */
7705
7706     const STRLEN length = (STRLEN) list[0];
7707     const UV version_id =          list[1];
7708     const bool offset   =    cBOOL(list[2]);
7709 #define HEADER_LENGTH 3
7710     /* If any of the above changes in any way, you must change HEADER_LENGTH
7711      * (if appropriate) and regenerate INVLIST_VERSION_ID by running
7712      *      perl -E 'say int(rand 2**31-1)'
7713      */
7714 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
7715                                         data structure type, so that one being
7716                                         passed in can be validated to be an
7717                                         inversion list of the correct vintage.
7718                                        */
7719
7720     SV* invlist = newSV_type(SVt_INVLIST);
7721
7722     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7723
7724     if (version_id != INVLIST_VERSION_ID) {
7725         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7726     }
7727
7728     /* The generated array passed in includes header elements that aren't part
7729      * of the list proper, so start it just after them */
7730     SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
7731
7732     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
7733                                shouldn't touch it */
7734
7735     *(get_invlist_offset_addr(invlist)) = offset;
7736
7737     /* The 'length' passed to us is the physical number of elements in the
7738      * inversion list.  But if there is an offset the logical number is one
7739      * less than that */
7740     invlist_set_len(invlist, length  - offset, offset);
7741
7742     invlist_set_previous_index(invlist, 0);
7743
7744     /* Initialize the iteration pointer. */
7745     invlist_iterfinish(invlist);
7746
7747     return invlist;
7748 }
7749
7750 STATIC void
7751 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7752 {
7753     /* Grow the maximum size of an inversion list */
7754
7755     PERL_ARGS_ASSERT_INVLIST_EXTEND;
7756
7757     assert(SvTYPE(invlist) == SVt_INVLIST);
7758
7759     /* Add one to account for the zero element at the beginning which may not
7760      * be counted by the calling parameters */
7761     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
7762 }
7763
7764 PERL_STATIC_INLINE void
7765 S_invlist_trim(pTHX_ SV* const invlist)
7766 {
7767     PERL_ARGS_ASSERT_INVLIST_TRIM;
7768
7769     assert(SvTYPE(invlist) == SVt_INVLIST);
7770
7771     /* Change the length of the inversion list to how many entries it currently
7772      * has */
7773     SvPV_shrink_to_cur((SV *) invlist);
7774 }
7775
7776 STATIC void
7777 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7778 {
7779    /* Subject to change or removal.  Append the range from 'start' to 'end' at
7780     * the end of the inversion list.  The range must be above any existing
7781     * ones. */
7782
7783     UV* array;
7784     UV max = invlist_max(invlist);
7785     UV len = _invlist_len(invlist);
7786     bool offset;
7787
7788     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7789
7790     if (len == 0) { /* Empty lists must be initialized */
7791         offset = start != 0;
7792         array = _invlist_array_init(invlist, ! offset);
7793     }
7794     else {
7795         /* Here, the existing list is non-empty. The current max entry in the
7796          * list is generally the first value not in the set, except when the
7797          * set extends to the end of permissible values, in which case it is
7798          * the first entry in that final set, and so this call is an attempt to
7799          * append out-of-order */
7800
7801         UV final_element = len - 1;
7802         array = invlist_array(invlist);
7803         if (array[final_element] > start
7804             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7805         {
7806             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7807                        array[final_element], start,
7808                        ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7809         }
7810
7811         /* Here, it is a legal append.  If the new range begins with the first
7812          * value not in the set, it is extending the set, so the new first
7813          * value not in the set is one greater than the newly extended range.
7814          * */
7815         offset = *get_invlist_offset_addr(invlist);
7816         if (array[final_element] == start) {
7817             if (end != UV_MAX) {
7818                 array[final_element] = end + 1;
7819             }
7820             else {
7821                 /* But if the end is the maximum representable on the machine,
7822                  * just let the range that this would extend to have no end */
7823                 invlist_set_len(invlist, len - 1, offset);
7824             }
7825             return;
7826         }
7827     }
7828
7829     /* Here the new range doesn't extend any existing set.  Add it */
7830
7831     len += 2;   /* Includes an element each for the start and end of range */
7832
7833     /* If wll overflow the existing space, extend, which may cause the array to
7834      * be moved */
7835     if (max < len) {
7836         invlist_extend(invlist, len);
7837
7838         /* Have to set len here to avoid assert failure in invlist_array() */
7839         invlist_set_len(invlist, len, offset);
7840
7841         array = invlist_array(invlist);
7842     }
7843     else {
7844         invlist_set_len(invlist, len, offset);
7845     }
7846
7847     /* The next item on the list starts the range, the one after that is
7848      * one past the new range.  */
7849     array[len - 2] = start;
7850     if (end != UV_MAX) {
7851         array[len - 1] = end + 1;
7852     }
7853     else {
7854         /* But if the end is the maximum representable on the machine, just let
7855          * the range have no end */
7856         invlist_set_len(invlist, len - 1, offset);
7857     }
7858 }
7859
7860 #ifndef PERL_IN_XSUB_RE
7861
7862 IV
7863 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7864 {
7865     /* Searches the inversion list for the entry that contains the input code
7866      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
7867      * return value is the index into the list's array of the range that
7868      * contains <cp> */
7869
7870     IV low = 0;
7871     IV mid;
7872     IV high = _invlist_len(invlist);
7873     const IV highest_element = high - 1;
7874     const UV* array;
7875
7876     PERL_ARGS_ASSERT__INVLIST_SEARCH;
7877
7878     /* If list is empty, return failure. */
7879     if (high == 0) {
7880         return -1;
7881     }
7882
7883     /* (We can't get the array unless we know the list is non-empty) */
7884     array = invlist_array(invlist);
7885
7886     mid = invlist_previous_index(invlist);
7887     assert(mid >=0 && mid <= highest_element);
7888
7889     /* <mid> contains the cache of the result of the previous call to this
7890      * function (0 the first time).  See if this call is for the same result,
7891      * or if it is for mid-1.  This is under the theory that calls to this
7892      * function will often be for related code points that are near each other.
7893      * And benchmarks show that caching gives better results.  We also test
7894      * here if the code point is within the bounds of the list.  These tests
7895      * replace others that would have had to be made anyway to make sure that
7896      * the array bounds were not exceeded, and these give us extra information
7897      * at the same time */
7898     if (cp >= array[mid]) {
7899         if (cp >= array[highest_element]) {
7900             return highest_element;
7901         }
7902
7903         /* Here, array[mid] <= cp < array[highest_element].  This means that
7904          * the final element is not the answer, so can exclude it; it also
7905          * means that <mid> is not the final element, so can refer to 'mid + 1'
7906          * safely */
7907         if (cp < array[mid + 1]) {
7908             return mid;
7909         }
7910         high--;
7911         low = mid + 1;
7912     }
7913     else { /* cp < aray[mid] */
7914         if (cp < array[0]) { /* Fail if outside the array */
7915             return -1;
7916         }
7917         high = mid;
7918         if (cp >= array[mid - 1]) {
7919             goto found_entry;
7920         }
7921     }
7922
7923     /* Binary search.  What we are looking for is <i> such that
7924      *  array[i] <= cp < array[i+1]
7925      * The loop below converges on the i+1.  Note that there may not be an
7926      * (i+1)th element in the array, and things work nonetheless */
7927     while (low < high) {
7928         mid = (low + high) / 2;
7929         assert(mid <= highest_element);
7930         if (array[mid] <= cp) { /* cp >= array[mid] */
7931             low = mid + 1;
7932
7933             /* We could do this extra test to exit the loop early.
7934             if (cp < array[low]) {
7935                 return mid;
7936             }
7937             */
7938         }
7939         else { /* cp < array[mid] */
7940             high = mid;
7941         }
7942     }
7943
7944   found_entry:
7945     high--;
7946     invlist_set_previous_index(invlist, high);
7947     return high;
7948 }
7949
7950 void
7951 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7952 {
7953     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7954      * but is used when the swash has an inversion list.  This makes this much
7955      * faster, as it uses a binary search instead of a linear one.  This is
7956      * intimately tied to that function, and perhaps should be in utf8.c,
7957      * except it is intimately tied to inversion lists as well.  It assumes
7958      * that <swatch> is all 0's on input */
7959
7960     UV current = start;
7961     const IV len = _invlist_len(invlist);
7962     IV i;
7963     const UV * array;
7964
7965     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7966
7967     if (len == 0) { /* Empty inversion list */
7968         return;
7969     }
7970
7971     array = invlist_array(invlist);
7972
7973     /* Find which element it is */
7974     i = _invlist_search(invlist, start);
7975
7976     /* We populate from <start> to <end> */
7977     while (current < end) {
7978         UV upper;
7979
7980         /* The inversion list gives the results for every possible code point
7981          * after the first one in the list.  Only those ranges whose index is
7982          * even are ones that the inversion list matches.  For the odd ones,
7983          * and if the initial code point is not in the list, we have to skip
7984          * forward to the next element */
7985         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7986             i++;
7987             if (i >= len) { /* Finished if beyond the end of the array */
7988                 return;
7989             }
7990             current = array[i];
7991             if (current >= end) {   /* Finished if beyond the end of what we
7992                                        are populating */
7993                 if (LIKELY(end < UV_MAX)) {
7994                     return;
7995                 }
7996
7997                 /* We get here when the upper bound is the maximum
7998                  * representable on the machine, and we are looking for just
7999                  * that code point.  Have to special case it */
8000                 i = len;
8001                 goto join_end_of_list;
8002             }
8003         }
8004         assert(current >= start);
8005
8006         /* The current range ends one below the next one, except don't go past
8007          * <end> */
8008         i++;
8009         upper = (i < len && array[i] < end) ? array[i] : end;
8010
8011         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
8012          * for each code point in it */
8013         for (; current < upper; current++) {
8014             const STRLEN offset = (STRLEN)(current - start);
8015             swatch[offset >> 3] |= 1 << (offset & 7);
8016         }
8017
8018     join_end_of_list:
8019
8020         /* Quit if at the end of the list */
8021         if (i >= len) {
8022
8023             /* But first, have to deal with the highest possible code point on
8024              * the platform.  The previous code assumes that <end> is one
8025              * beyond where we want to populate, but that is impossible at the
8026              * platform's infinity, so have to handle it specially */
8027             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8028             {
8029                 const STRLEN offset = (STRLEN)(end - start);
8030                 swatch[offset >> 3] |= 1 << (offset & 7);
8031             }
8032             return;
8033         }
8034
8035         /* Advance to the next range, which will be for code points not in the
8036          * inversion list */
8037         current = array[i];
8038     }
8039
8040     return;
8041 }
8042
8043 void
8044 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** output)
8045 {
8046     /* Take the union of two inversion lists and point <output> to it.  *output
8047      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8048      * the reference count to that list will be decremented if not already a
8049      * temporary (mortal); otherwise *output will be made correspondingly
8050      * mortal.  The first list, <a>, may be NULL, in which case a copy of the
8051      * second list is returned.  If <complement_b> is TRUE, the union is taken
8052      * of the complement (inversion) of <b> instead of b itself.
8053      *
8054      * The basis for this comes from "Unicode Demystified" Chapter 13 by
8055      * Richard Gillam, published by Addison-Wesley, and explained at some
8056      * length there.  The preface says to incorporate its examples into your
8057      * code at your own risk.
8058      *
8059      * The algorithm is like a merge sort.
8060      *
8061      * XXX A potential performance improvement is to keep track as we go along
8062      * if only one of the inputs contributes to the result, meaning the other
8063      * is a subset of that one.  In that case, we can skip the final copy and
8064      * return the larger of the input lists, but then outside code might need
8065      * to keep track of whether to free the input list or not */
8066
8067     const UV* array_a;    /* a's array */
8068     const UV* array_b;
8069     UV len_a;       /* length of a's array */
8070     UV len_b;
8071
8072     SV* u;                      /* the resulting union */
8073     UV* array_u;
8074     UV len_u;
8075
8076     UV i_a = 0;             /* current index into a's array */
8077     UV i_b = 0;
8078     UV i_u = 0;
8079
8080     /* running count, as explained in the algorithm source book; items are
8081      * stopped accumulating and are output when the count changes to/from 0.
8082      * The count is incremented when we start a range that's in the set, and
8083      * decremented when we start a range that's not in the set.  So its range
8084      * is 0 to 2.  Only when the count is zero is something not in the set.
8085      */
8086     UV count = 0;
8087
8088     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8089     assert(a != b);
8090
8091     /* If either one is empty, the union is the other one */
8092     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8093         bool make_temp = FALSE; /* Should we mortalize the result? */
8094
8095         if (*output == a) {
8096             if (a != NULL) {
8097                 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8098                     SvREFCNT_dec_NN(a);
8099                 }
8100             }
8101         }
8102         if (*output != b) {
8103             *output = invlist_clone(b);
8104             if (complement_b) {
8105                 _invlist_invert(*output);
8106             }
8107         } /* else *output already = b; */
8108
8109         if (make_temp) {
8110             sv_2mortal(*output);
8111         }
8112         return;
8113     }
8114     else if ((len_b = _invlist_len(b)) == 0) {
8115         bool make_temp = FALSE;
8116         if (*output == b) {
8117             if (! (make_temp = cBOOL(SvTEMP(b)))) {
8118                 SvREFCNT_dec_NN(b);
8119             }
8120         }
8121
8122         /* The complement of an empty list is a list that has everything in it,
8123          * so the union with <a> includes everything too */
8124         if (complement_b) {
8125             if (a == *output) {
8126                 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8127                     SvREFCNT_dec_NN(a);
8128                 }
8129             }
8130             *output = _new_invlist(1);
8131             _append_range_to_invlist(*output, 0, UV_MAX);
8132         }
8133         else if (*output != a) {
8134             *output = invlist_clone(a);
8135         }
8136         /* else *output already = a; */
8137
8138         if (make_temp) {
8139             sv_2mortal(*output);
8140         }
8141         return;
8142     }
8143
8144     /* Here both lists exist and are non-empty */
8145     array_a = invlist_array(a);
8146     array_b = invlist_array(b);
8147
8148     /* If are to take the union of 'a' with the complement of b, set it
8149      * up so are looking at b's complement. */
8150     if (complement_b) {
8151
8152         /* To complement, we invert: if the first element is 0, remove it.  To
8153          * do this, we just pretend the array starts one later */
8154         if (array_b[0] == 0) {
8155             array_b++;
8156             len_b--;
8157         }
8158         else {
8159
8160             /* But if the first element is not zero, we pretend the list starts
8161              * at the 0 that is always stored immediately before the array. */
8162             array_b--;
8163             len_b++;
8164         }
8165     }
8166
8167     /* Size the union for the worst case: that the sets are completely
8168      * disjoint */
8169     u = _new_invlist(len_a + len_b);
8170
8171     /* Will contain U+0000 if either component does */
8172     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8173                                       || (len_b > 0 && array_b[0] == 0));
8174
8175     /* Go through each list item by item, stopping when exhausted one of
8176      * them */
8177     while (i_a < len_a && i_b < len_b) {
8178         UV cp;      /* The element to potentially add to the union's array */
8179         bool cp_in_set;   /* is it in the the input list's set or not */
8180
8181         /* We need to take one or the other of the two inputs for the union.
8182          * Since we are merging two sorted lists, we take the smaller of the
8183          * next items.  In case of a tie, we take the one that is in its set
8184          * first.  If we took one not in the set first, it would decrement the
8185          * count, possibly to 0 which would cause it to be output as ending the
8186          * range, and the next time through we would take the same number, and
8187          * output it again as beginning the next range.  By doing it the
8188          * opposite way, there is no possibility that the count will be
8189          * momentarily decremented to 0, and thus the two adjoining ranges will
8190          * be seamlessly merged.  (In a tie and both are in the set or both not
8191          * in the set, it doesn't matter which we take first.) */
8192         if (array_a[i_a] < array_b[i_b]
8193             || (array_a[i_a] == array_b[i_b]
8194                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8195         {
8196             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8197             cp= array_a[i_a++];
8198         }
8199         else {
8200             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8201             cp = array_b[i_b++];
8202         }
8203
8204         /* Here, have chosen which of the two inputs to look at.  Only output
8205          * if the running count changes to/from 0, which marks the
8206          * beginning/end of a range in that's in the set */
8207         if (cp_in_set) {
8208             if (count == 0) {
8209                 array_u[i_u++] = cp;
8210             }
8211             count++;
8212         }
8213         else {
8214             count--;
8215             if (count == 0) {
8216                 array_u[i_u++] = cp;
8217             }
8218         }
8219     }
8220
8221     /* Here, we are finished going through at least one of the lists, which
8222      * means there is something remaining in at most one.  We check if the list
8223      * that hasn't been exhausted is positioned such that we are in the middle
8224      * of a range in its set or not.  (i_a and i_b point to the element beyond
8225      * the one we care about.) If in the set, we decrement 'count'; if 0, there
8226      * is potentially more to output.
8227      * There are four cases:
8228      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
8229      *     in the union is entirely from the non-exhausted set.
8230      *  2) Both were in their sets, count is 2.  Nothing further should
8231      *     be output, as everything that remains will be in the exhausted
8232      *     list's set, hence in the union; decrementing to 1 but not 0 insures
8233      *     that
8234      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
8235      *     Nothing further should be output because the union includes
8236      *     everything from the exhausted set.  Not decrementing ensures that.
8237      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8238      *     decrementing to 0 insures that we look at the remainder of the
8239      *     non-exhausted set */
8240     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8241         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8242     {
8243         count--;
8244     }
8245
8246     /* The final length is what we've output so far, plus what else is about to
8247      * be output.  (If 'count' is non-zero, then the input list we exhausted
8248      * has everything remaining up to the machine's limit in its set, and hence
8249      * in the union, so there will be no further output. */
8250     len_u = i_u;
8251     if (count == 0) {
8252         /* At most one of the subexpressions will be non-zero */
8253         len_u += (len_a - i_a) + (len_b - i_b);
8254     }
8255
8256     /* Set result to final length, which can change the pointer to array_u, so
8257      * re-find it */
8258     if (len_u != _invlist_len(u)) {
8259         invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8260         invlist_trim(u);
8261         array_u = invlist_array(u);
8262     }
8263
8264     /* When 'count' is 0, the list that was exhausted (if one was shorter than
8265      * the other) ended with everything above it not in its set.  That means
8266      * that the remaining part of the union is precisely the same as the
8267      * non-exhausted list, so can just copy it unchanged.  (If both list were
8268      * exhausted at the same time, then the operations below will be both 0.)
8269      */
8270     if (count == 0) {
8271         IV copy_count; /* At most one will have a non-zero copy count */
8272         if ((copy_count = len_a - i_a) > 0) {
8273             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8274         }
8275         else if ((copy_count = len_b - i_b) > 0) {
8276             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8277         }
8278     }
8279
8280     /*  We may be removing a reference to one of the inputs.  If so, the output
8281      *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8282      *  count decremented) */
8283     if (a == *output || b == *output) {
8284         assert(! invlist_is_iterating(*output));
8285         if ((SvTEMP(*output))) {
8286             sv_2mortal(u);
8287         }
8288         else {
8289             SvREFCNT_dec_NN(*output);
8290         }
8291     }
8292
8293     *output = u;
8294
8295     return;
8296 }
8297
8298 void
8299 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** i)
8300 {
8301     /* Take the intersection of two inversion lists and point <i> to it.  *i
8302      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8303      * the reference count to that list will be decremented if not already a
8304      * temporary (mortal); otherwise *i will be made correspondingly mortal.
8305      * The first list, <a>, may be NULL, in which case an empty list is
8306      * returned.  If <complement_b> is TRUE, the result will be the
8307      * intersection of <a> and the complement (or inversion) of <b> instead of
8308      * <b> directly.
8309      *
8310      * The basis for this comes from "Unicode Demystified" Chapter 13 by
8311      * Richard Gillam, published by Addison-Wesley, and explained at some
8312      * length there.  The preface says to incorporate its examples into your
8313      * code at your own risk.  In fact, it had bugs
8314      *
8315      * The algorithm is like a merge sort, and is essentially the same as the
8316      * union above
8317      */
8318
8319     const UV* array_a;          /* a's array */
8320     const UV* array_b;
8321     UV len_a;   /* length of a's array */
8322     UV len_b;
8323
8324     SV* r;                   /* the resulting intersection */
8325     UV* array_r;
8326     UV len_r;
8327
8328     UV i_a = 0;             /* current index into a's array */
8329     UV i_b = 0;
8330     UV i_r = 0;
8331
8332     /* running count, as explained in the algorithm source book; items are
8333      * stopped accumulating and are output when the count changes to/from 2.
8334      * The count is incremented when we start a range that's in the set, and
8335      * decremented when we start a range that's not in the set.  So its range
8336      * is 0 to 2.  Only when the count is 2 is something in the intersection.
8337      */
8338     UV count = 0;
8339
8340     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8341     assert(a != b);
8342
8343     /* Special case if either one is empty */
8344     len_a = (a == NULL) ? 0 : _invlist_len(a);
8345     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8346         bool make_temp = FALSE;
8347
8348         if (len_a != 0 && complement_b) {
8349
8350             /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8351              * be empty.  Here, also we are using 'b's complement, which hence
8352              * must be every possible code point.  Thus the intersection is
8353              * simply 'a'. */
8354             if (*i != a) {
8355                 if (*i == b) {
8356                     if (! (make_temp = cBOOL(SvTEMP(b)))) {
8357                         SvREFCNT_dec_NN(b);
8358                     }
8359                 }
8360
8361                 *i = invlist_clone(a);
8362             }
8363             /* else *i is already 'a' */
8364
8365             if (make_temp) {
8366                 sv_2mortal(*i);
8367             }
8368             return;
8369         }
8370
8371         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
8372          * intersection must be empty */
8373         if (*i == a) {
8374             if (! (make_temp = cBOOL(SvTEMP(a)))) {
8375                 SvREFCNT_dec_NN(a);
8376             }
8377         }
8378         else if (*i == b) {
8379             if (! (make_temp = cBOOL(SvTEMP(b)))) {
8380                 SvREFCNT_dec_NN(b);
8381             }
8382         }
8383         *i = _new_invlist(0);
8384         if (make_temp) {
8385             sv_2mortal(*i);
8386         }
8387
8388         return;
8389     }
8390
8391     /* Here both lists exist and are non-empty */
8392     array_a = invlist_array(a);
8393     array_b = invlist_array(b);
8394
8395     /* If are to take the intersection of 'a' with the complement of b, set it
8396      * up so are looking at b's complement. */
8397     if (complement_b) {
8398
8399         /* To complement, we invert: if the first element is 0, remove it.  To
8400          * do this, we just pretend the array starts one later */
8401         if (array_b[0] == 0) {
8402             array_b++;
8403             len_b--;
8404         }
8405         else {
8406
8407             /* But if the first element is not zero, we pretend the list starts
8408              * at the 0 that is always stored immediately before the array. */
8409             array_b--;
8410             len_b++;
8411         }
8412     }
8413
8414     /* Size the intersection for the worst case: that the intersection ends up
8415      * fragmenting everything to be completely disjoint */
8416     r= _new_invlist(len_a + len_b);
8417
8418     /* Will contain U+0000 iff both components do */
8419     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8420                                      && len_b > 0 && array_b[0] == 0);
8421
8422     /* Go through each list item by item, stopping when exhausted one of
8423      * them */
8424     while (i_a < len_a && i_b < len_b) {
8425         UV cp;      /* The element to potentially add to the intersection's
8426                        array */
8427         bool cp_in_set; /* Is it in the input list's set or not */
8428
8429         /* We need to take one or the other of the two inputs for the
8430          * intersection.  Since we are merging two sorted lists, we take the
8431          * smaller of the next items.  In case of a tie, we take the one that
8432          * is not in its set first (a difference from the union algorithm).  If
8433          * we took one in the set first, it would increment the count, possibly
8434          * to 2 which would cause it to be output as starting a range in the
8435          * intersection, and the next time through we would take that same
8436          * number, and output it again as ending the set.  By doing it the
8437          * opposite of this, there is no possibility that the count will be
8438          * momentarily incremented to 2.  (In a tie and both are in the set or
8439          * both not in the set, it doesn't matter which we take first.) */
8440         if (array_a[i_a] < array_b[i_b]
8441             || (array_a[i_a] == array_b[i_b]
8442                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8443         {
8444             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8445             cp= array_a[i_a++];
8446         }
8447         else {
8448             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8449             cp= array_b[i_b++];
8450         }
8451
8452         /* Here, have chosen which of the two inputs to look at.  Only output
8453          * if the running count changes to/from 2, which marks the
8454          * beginning/end of a range that's in the intersection */
8455         if (cp_in_set) {
8456             count++;
8457             if (count == 2) {
8458                 array_r[i_r++] = cp;
8459             }
8460         }
8461         else {
8462             if (count == 2) {
8463                 array_r[i_r++] = cp;
8464             }
8465             count--;
8466         }
8467     }
8468
8469     /* Here, we are finished going through at least one of the lists, which
8470      * means there is something remaining in at most one.  We check if the list
8471      * that has been exhausted is positioned such that we are in the middle
8472      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
8473      * the ones we care about.)  There are four cases:
8474      *  1) Both weren't in their sets, count is 0, and remains 0.  There's
8475      *     nothing left in the intersection.
8476      *  2) Both were in their sets, count is 2 and perhaps is incremented to
8477      *     above 2.  What should be output is exactly that which is in the
8478      *     non-exhausted set, as everything it has is also in the intersection
8479      *     set, and everything it doesn't have can't be in the intersection
8480      *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8481      *     gets incremented to 2.  Like the previous case, the intersection is
8482      *     everything that remains in the non-exhausted set.
8483      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8484      *     remains 1.  And the intersection has nothing more. */
8485     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8486         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8487     {
8488         count++;
8489     }
8490
8491     /* The final length is what we've output so far plus what else is in the
8492      * intersection.  At most one of the subexpressions below will be non-zero */
8493     len_r = i_r;
8494     if (count >= 2) {
8495         len_r += (len_a - i_a) + (len_b - i_b);
8496     }
8497
8498     /* Set result to final length, which can change the pointer to array_r, so
8499      * re-find it */
8500     if (len_r != _invlist_len(r)) {
8501         invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8502         invlist_trim(r);
8503         array_r = invlist_array(r);
8504     }
8505
8506     /* Finish outputting any remaining */
8507     if (count >= 2) { /* At most one will have a non-zero copy count */
8508         IV copy_count;
8509         if ((copy_count = len_a - i_a) > 0) {
8510             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8511         }
8512         else if ((copy_count = len_b - i_b) > 0) {
8513             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8514         }
8515     }
8516
8517     /*  We may be removing a reference to one of the inputs.  If so, the output
8518      *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8519      *  count decremented) */
8520     if (a == *i || b == *i) {
8521         assert(! invlist_is_iterating(*i));
8522         if (SvTEMP(*i)) {
8523             sv_2mortal(r);
8524         }
8525         else {
8526             SvREFCNT_dec_NN(*i);
8527         }
8528     }
8529
8530     *i = r;
8531
8532     return;
8533 }
8534
8535 SV*
8536 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8537 {
8538     /* Add the range from 'start' to 'end' inclusive to the inversion list's
8539      * set.  A pointer to the inversion list is returned.  This may actually be
8540      * a new list, in which case the passed in one has been destroyed.  The
8541      * passed in inversion list can be NULL, in which case a new one is created
8542      * with just the one range in it */
8543
8544     SV* range_invlist;
8545     UV len;
8546
8547     if (invlist == NULL) {
8548         invlist = _new_invlist(2);
8549         len = 0;
8550     }
8551     else {
8552         len = _invlist_len(invlist);
8553     }
8554
8555     /* If comes after the final entry actually in the list, can just append it
8556      * to the end, */
8557     if (len == 0
8558         || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8559             && start >= invlist_array(invlist)[len - 1]))
8560     {
8561         _append_range_to_invlist(invlist, start, end);
8562         return invlist;
8563     }
8564
8565     /* Here, can't just append things, create and return a new inversion list
8566      * which is the union of this range and the existing inversion list */
8567     range_invlist = _new_invlist(2);
8568     _append_range_to_invlist(range_invlist, start, end);
8569
8570     _invlist_union(invlist, range_invlist, &invlist);
8571
8572     /* The temporary can be freed */
8573     SvREFCNT_dec_NN(range_invlist);
8574
8575     return invlist;
8576 }
8577
8578 SV*
8579 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0, UV** other_elements_ptr)
8580 {
8581     /* Create and return an inversion list whose contents are to be populated
8582      * by the caller.  The caller gives the number of elements (in 'size') and
8583      * the very first element ('element0').  This function will set
8584      * '*other_elements_ptr' to an array of UVs, where the remaining elements
8585      * are to be placed.
8586      *
8587      * Obviously there is some trust involved that the caller will properly
8588      * fill in the other elements of the array.
8589      *
8590      * (The first element needs to be passed in, as the underlying code does
8591      * things differently depending on whether it is zero or non-zero) */
8592
8593     SV* invlist = _new_invlist(size);
8594     bool offset;
8595
8596     PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
8597
8598     _append_range_to_invlist(invlist, element0, element0);
8599     offset = *get_invlist_offset_addr(invlist);
8600
8601     invlist_set_len(invlist, size, offset);
8602     *other_elements_ptr = invlist_array(invlist) + 1;
8603     return invlist;
8604 }
8605
8606 #endif
8607
8608 PERL_STATIC_INLINE SV*
8609 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8610     return _add_range_to_invlist(invlist, cp, cp);
8611 }
8612
8613 #ifndef PERL_IN_XSUB_RE
8614 void
8615 Perl__invlist_invert(pTHX_ SV* const invlist)
8616 {
8617     /* Complement the input inversion list.  This adds a 0 if the list didn't
8618      * have a zero; removes it otherwise.  As described above, the data
8619      * structure is set up so that this is very efficient */
8620
8621     PERL_ARGS_ASSERT__INVLIST_INVERT;
8622
8623     assert(! invlist_is_iterating(invlist));
8624
8625     /* The inverse of matching nothing is matching everything */
8626     if (_invlist_len(invlist) == 0) {
8627         _append_range_to_invlist(invlist, 0, UV_MAX);
8628         return;
8629     }
8630
8631     *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
8632 }
8633
8634 #endif
8635
8636 PERL_STATIC_INLINE SV*
8637 S_invlist_clone(pTHX_ SV* const invlist)
8638 {
8639
8640     /* Return a new inversion list that is a copy of the input one, which is
8641      * unchanged.  The new list will not be mortal even if the old one was. */
8642
8643     /* Need to allocate extra space to accommodate Perl's addition of a
8644      * trailing NUL to SvPV's, since it thinks they are always strings */
8645     SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8646     STRLEN physical_length = SvCUR(invlist);
8647     bool offset = *(get_invlist_offset_addr(invlist));
8648
8649     PERL_ARGS_ASSERT_INVLIST_CLONE;
8650
8651     *(get_invlist_offset_addr(new_invlist)) = offset;
8652     invlist_set_len(new_invlist, _invlist_len(invlist), offset);
8653     Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
8654
8655     return new_invlist;
8656 }
8657
8658 PERL_STATIC_INLINE STRLEN*
8659 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8660 {
8661     /* Return the address of the UV that contains the current iteration
8662      * position */
8663
8664     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8665
8666     assert(SvTYPE(invlist) == SVt_INVLIST);
8667
8668     return &(((XINVLIST*) SvANY(invlist))->iterator);
8669 }
8670
8671 PERL_STATIC_INLINE void
8672 S_invlist_iterinit(pTHX_ SV* invlist)   /* Initialize iterator for invlist */
8673 {
8674     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8675
8676     *get_invlist_iter_addr(invlist) = 0;
8677 }
8678
8679 PERL_STATIC_INLINE void
8680 S_invlist_iterfinish(pTHX_ SV* invlist)
8681 {
8682     /* Terminate iterator for invlist.  This is to catch development errors.
8683      * Any iteration that is interrupted before completed should call this
8684      * function.  Functions that add code points anywhere else but to the end
8685      * of an inversion list assert that they are not in the middle of an
8686      * iteration.  If they were, the addition would make the iteration
8687      * problematical: if the iteration hadn't reached the place where things
8688      * were being added, it would be ok */
8689
8690     PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8691
8692     *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
8693 }
8694
8695 STATIC bool
8696 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8697 {
8698     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8699      * This call sets in <*start> and <*end>, the next range in <invlist>.
8700      * Returns <TRUE> if successful and the next call will return the next
8701      * range; <FALSE> if was already at the end of the list.  If the latter,
8702      * <*start> and <*end> are unchanged, and the next call to this function
8703      * will start over at the beginning of the list */
8704
8705     STRLEN* pos = get_invlist_iter_addr(invlist);
8706     UV len = _invlist_len(invlist);
8707     UV *array;
8708
8709     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8710
8711     if (*pos >= len) {
8712         *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
8713         return FALSE;
8714     }
8715
8716     array = invlist_array(invlist);
8717
8718     *start = array[(*pos)++];
8719
8720     if (*pos >= len) {
8721         *end = UV_MAX;
8722     }
8723     else {
8724         *end = array[(*pos)++] - 1;
8725     }
8726
8727     return TRUE;
8728 }
8729
8730 PERL_STATIC_INLINE bool
8731 S_invlist_is_iterating(pTHX_ SV* const invlist)
8732 {
8733     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8734
8735     return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8736 }
8737
8738 PERL_STATIC_INLINE UV
8739 S_invlist_highest(pTHX_ SV* const invlist)
8740 {
8741     /* Returns the highest code point that matches an inversion list.  This API
8742      * has an ambiguity, as it returns 0 under either the highest is actually
8743      * 0, or if the list is empty.  If this distinction matters to you, check
8744      * for emptiness before calling this function */
8745
8746     UV len = _invlist_len(invlist);
8747     UV *array;
8748
8749     PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8750
8751     if (len == 0) {
8752         return 0;
8753     }
8754
8755     array = invlist_array(invlist);
8756
8757     /* The last element in the array in the inversion list always starts a
8758      * range that goes to infinity.  That range may be for code points that are
8759      * matched in the inversion list, or it may be for ones that aren't
8760      * matched.  In the latter case, the highest code point in the set is one
8761      * less than the beginning of this range; otherwise it is the final element
8762      * of this range: infinity */
8763     return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8764            ? UV_MAX
8765            : array[len - 1] - 1;
8766 }
8767
8768 #ifndef PERL_IN_XSUB_RE
8769 SV *
8770 Perl__invlist_contents(pTHX_ SV* const invlist)
8771 {
8772     /* Get the contents of an inversion list into a string SV so that they can
8773      * be printed out.  It uses the format traditionally done for debug tracing
8774      */
8775
8776     UV start, end;
8777     SV* output = newSVpvs("\n");
8778
8779     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8780
8781     assert(! invlist_is_iterating(invlist));
8782
8783     invlist_iterinit(invlist);
8784     while (invlist_iternext(invlist, &start, &end)) {
8785         if (end == UV_MAX) {
8786             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8787         }
8788         else if (end != start) {
8789             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8790                     start,       end);
8791         }
8792         else {
8793             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8794         }
8795     }
8796
8797     return output;
8798 }
8799 #endif
8800
8801 #ifndef PERL_IN_XSUB_RE
8802 void
8803 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, const char * const indent, SV* const invlist)
8804 {
8805     /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
8806      * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
8807      * the string 'indent'.  The output looks like this:
8808          [0] 0x000A .. 0x000D
8809          [2] 0x0085
8810          [4] 0x2028 .. 0x2029
8811          [6] 0x3104 .. INFINITY
8812      * This means that the first range of code points matched by the list are
8813      * 0xA through 0xD; the second range contains only the single code point
8814      * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
8815      * are used to define each range (except if the final range extends to
8816      * infinity, only a single element is needed).  The array index of the
8817      * first element for the corresponding range is given in brackets. */
8818
8819     UV start, end;
8820     STRLEN count = 0;
8821
8822     PERL_ARGS_ASSERT__INVLIST_DUMP;
8823
8824     if (invlist_is_iterating(invlist)) {
8825         Perl_dump_indent(aTHX_ level, file,
8826              "%sCan't dump inversion list because is in middle of iterating\n",
8827              indent);
8828         return;
8829     }
8830
8831     invlist_iterinit(invlist);
8832     while (invlist_iternext(invlist, &start, &end)) {
8833         if (end == UV_MAX) {
8834             Perl_dump_indent(aTHX_ level, file,
8835                                        "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
8836                                    indent, (UV)count, start);
8837         }
8838         else if (end != start) {
8839             Perl_dump_indent(aTHX_ level, file,
8840                                     "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
8841                                 indent, (UV)count, start,         end);
8842         }
8843         else {
8844             Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
8845                                             indent, (UV)count, start);
8846         }
8847         count += 2;
8848     }
8849 }
8850 #endif
8851
8852 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
8853 bool
8854 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
8855 {
8856     /* Return a boolean as to if the two passed in inversion lists are
8857      * identical.  The final argument, if TRUE, says to take the complement of
8858      * the second inversion list before doing the comparison */
8859
8860     const UV* array_a = invlist_array(a);
8861     const UV* array_b = invlist_array(b);
8862     UV len_a = _invlist_len(a);
8863     UV len_b = _invlist_len(b);
8864
8865     UV i = 0;               /* current index into the arrays */
8866     bool retval = TRUE;     /* Assume are identical until proven otherwise */
8867
8868     PERL_ARGS_ASSERT__INVLISTEQ;
8869
8870     /* If are to compare 'a' with the complement of b, set it
8871      * up so are looking at b's complement. */
8872     if (complement_b) {
8873
8874         /* The complement of nothing is everything, so <a> would have to have
8875          * just one element, starting at zero (ending at infinity) */
8876         if (len_b == 0) {
8877             return (len_a == 1 && array_a[0] == 0);
8878         }
8879         else if (array_b[0] == 0) {
8880
8881             /* Otherwise, to complement, we invert.  Here, the first element is
8882              * 0, just remove it.  To do this, we just pretend the array starts
8883              * one later */
8884
8885             array_b++;
8886             len_b--;
8887         }
8888         else {
8889
8890             /* But if the first element is not zero, we pretend the list starts
8891              * at the 0 that is always stored immediately before the array. */
8892             array_b--;
8893             len_b++;
8894         }
8895     }
8896
8897     /* Make sure that the lengths are the same, as well as the final element
8898      * before looping through the remainder.  (Thus we test the length, final,
8899      * and first elements right off the bat) */
8900     if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8901         retval = FALSE;
8902     }
8903     else for (i = 0; i < len_a - 1; i++) {
8904         if (array_a[i] != array_b[i]) {
8905             retval = FALSE;
8906             break;
8907         }
8908     }
8909
8910     return retval;
8911 }
8912 #endif
8913
8914 #undef HEADER_LENGTH
8915 #undef TO_INTERNAL_SIZE
8916 #undef FROM_INTERNAL_SIZE
8917 #undef INVLIST_VERSION_ID
8918
8919 /* End of inversion list object */
8920
8921 STATIC void
8922 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
8923 {
8924     /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
8925      * constructs, and updates RExC_flags with them.  On input, RExC_parse
8926      * should point to the first flag; it is updated on output to point to the
8927      * final ')' or ':'.  There needs to be at least one flag, or this will
8928      * abort */
8929
8930     /* for (?g), (?gc), and (?o) warnings; warning
8931        about (?c) will warn about (?g) -- japhy    */
8932
8933 #define WASTED_O  0x01
8934 #define WASTED_G  0x02
8935 #define WASTED_C  0x04
8936 #define WASTED_GC (WASTED_G|WASTED_C)
8937     I32 wastedflags = 0x00;
8938     U32 posflags = 0, negflags = 0;
8939     U32 *flagsp = &posflags;
8940     char has_charset_modifier = '\0';
8941     regex_charset cs;
8942     bool has_use_defaults = FALSE;
8943     const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
8944
8945     PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
8946
8947     /* '^' as an initial flag sets certain defaults */
8948     if (UCHARAT(RExC_parse) == '^') {
8949         RExC_parse++;
8950         has_use_defaults = TRUE;
8951         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8952         set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8953                                         ? REGEX_UNICODE_CHARSET
8954                                         : REGEX_DEPENDS_CHARSET);
8955     }
8956
8957     cs = get_regex_charset(RExC_flags);
8958     if (cs == REGEX_DEPENDS_CHARSET
8959         && (RExC_utf8 || RExC_uni_semantics))
8960     {
8961         cs = REGEX_UNICODE_CHARSET;
8962     }
8963
8964     while (*RExC_parse) {
8965         /* && strchr("iogcmsx", *RExC_parse) */
8966         /* (?g), (?gc) and (?o) are useless here
8967            and must be globally applied -- japhy */
8968         switch (*RExC_parse) {
8969
8970             /* Code for the imsx flags */
8971             CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8972
8973             case LOCALE_PAT_MOD:
8974                 if (has_charset_modifier) {
8975                     goto excess_modifier;
8976                 }
8977                 else if (flagsp == &negflags) {
8978                     goto neg_modifier;
8979                 }
8980                 cs = REGEX_LOCALE_CHARSET;
8981                 has_charset_modifier = LOCALE_PAT_MOD;
8982                 RExC_contains_locale = 1;
8983                 break;
8984             case UNICODE_PAT_MOD:
8985                 if (has_charset_modifier) {
8986                     goto excess_modifier;
8987                 }
8988                 else if (flagsp == &negflags) {
8989                     goto neg_modifier;
8990                 }
8991                 cs = REGEX_UNICODE_CHARSET;
8992                 has_charset_modifier = UNICODE_PAT_MOD;
8993                 break;
8994             case ASCII_RESTRICT_PAT_MOD:
8995                 if (flagsp == &negflags) {
8996                     goto neg_modifier;
8997                 }
8998                 if (has_charset_modifier) {
8999                     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9000                         goto excess_modifier;
9001                     }
9002                     /* Doubled modifier implies more restricted */
9003                     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9004                 }
9005                 else {
9006                     cs = REGEX_ASCII_RESTRICTED_CHARSET;
9007                 }
9008                 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9009                 break;
9010             case DEPENDS_PAT_MOD:
9011                 if (has_use_defaults) {
9012                     goto fail_modifiers;
9013                 }
9014                 else if (flagsp == &negflags) {
9015                     goto neg_modifier;
9016                 }
9017                 else if (has_charset_modifier) {
9018                     goto excess_modifier;
9019                 }
9020
9021                 /* The dual charset means unicode semantics if the
9022                  * pattern (or target, not known until runtime) are
9023                  * utf8, or something in the pattern indicates unicode
9024                  * semantics */
9025                 cs = (RExC_utf8 || RExC_uni_semantics)
9026                      ? REGEX_UNICODE_CHARSET
9027                      : REGEX_DEPENDS_CHARSET;
9028                 has_charset_modifier = DEPENDS_PAT_MOD;
9029                 break;
9030             excess_modifier:
9031                 RExC_parse++;
9032                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9033                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9034                 }
9035                 else if (has_charset_modifier == *(RExC_parse - 1)) {
9036                     vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
9037                 }
9038                 else {
9039                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9040                 }
9041                 /*NOTREACHED*/
9042             neg_modifier:
9043                 RExC_parse++;
9044                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
9045                 /*NOTREACHED*/
9046             case ONCE_PAT_MOD: /* 'o' */
9047             case GLOBAL_PAT_MOD: /* 'g' */
9048                 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9049                     const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9050                     if (! (wastedflags & wflagbit) ) {
9051                         wastedflags |= wflagbit;
9052                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9053                         vWARN5(
9054                             RExC_parse + 1,
9055                             "Useless (%s%c) - %suse /%c modifier",
9056                             flagsp == &negflags ? "?-" : "?",
9057                             *RExC_parse,
9058                             flagsp == &negflags ? "don't " : "",
9059                             *RExC_parse
9060                         );
9061                     }
9062                 }
9063                 break;
9064
9065             case CONTINUE_PAT_MOD: /* 'c' */
9066                 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9067                     if (! (wastedflags & WASTED_C) ) {
9068                         wastedflags |= WASTED_GC;
9069                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9070                         vWARN3(
9071                             RExC_parse + 1,
9072                             "Useless (%sc) - %suse /gc modifier",
9073                             flagsp == &negflags ? "?-" : "?",
9074                             flagsp == &negflags ? "don't " : ""
9075                         );
9076                     }
9077                 }
9078                 break;
9079             case KEEPCOPY_PAT_MOD: /* 'p' */
9080                 if (flagsp == &negflags) {
9081                     if (SIZE_ONLY)
9082                         ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9083                 } else {
9084                     *flagsp |= RXf_PMf_KEEPCOPY;
9085                 }
9086                 break;
9087             case '-':
9088                 /* A flag is a default iff it is following a minus, so
9089                  * if there is a minus, it means will be trying to
9090                  * re-specify a default which is an error */
9091                 if (has_use_defaults || flagsp == &negflags) {
9092                     goto fail_modifiers;
9093                 }
9094                 flagsp = &negflags;
9095                 wastedflags = 0;  /* reset so (?g-c) warns twice */
9096                 break;
9097             case ':':
9098             case ')':
9099                 RExC_flags |= posflags;
9100                 RExC_flags &= ~negflags;
9101                 set_regex_charset(&RExC_flags, cs);
9102                 if (RExC_flags & RXf_PMf_FOLD) {
9103                     RExC_contains_i = 1;
9104                 }
9105                 return;
9106                 /*NOTREACHED*/
9107             default:
9108             fail_modifiers:
9109                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9110                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9111                 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9112                       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9113                 /*NOTREACHED*/
9114         }
9115
9116         ++RExC_parse;
9117     }
9118 }
9119
9120 /*
9121  - reg - regular expression, i.e. main body or parenthesized thing
9122  *
9123  * Caller must absorb opening parenthesis.
9124  *
9125  * Combining parenthesis handling with the base level of regular expression
9126  * is a trifle forced, but the need to tie the tails of the branches to what
9127  * follows makes it hard to avoid.
9128  */
9129 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9130 #ifdef DEBUGGING
9131 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9132 #else
9133 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9134 #endif
9135
9136 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9137    flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
9138    needs to be restarted.
9139    Otherwise would only return NULL if regbranch() returns NULL, which
9140    cannot happen.  */
9141 STATIC regnode *
9142 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9143     /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9144      * 2 is like 1, but indicates that nextchar() has been called to advance
9145      * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
9146      * this flag alerts us to the need to check for that */
9147 {
9148     dVAR;
9149     regnode *ret;               /* Will be the head of the group. */
9150     regnode *br;
9151     regnode *lastbr;
9152     regnode *ender = NULL;
9153     I32 parno = 0;
9154     I32 flags;
9155     U32 oregflags = RExC_flags;
9156     bool have_branch = 0;
9157     bool is_open = 0;
9158     I32 freeze_paren = 0;
9159     I32 after_freeze = 0;
9160
9161     char * parse_start = RExC_parse; /* MJD */
9162     char * const oregcomp_parse = RExC_parse;
9163
9164     GET_RE_DEBUG_FLAGS_DECL;
9165
9166     PERL_ARGS_ASSERT_REG;
9167     DEBUG_PARSE("reg ");
9168
9169     *flagp = 0;                         /* Tentatively. */
9170
9171
9172     /* Make an OPEN node, if parenthesized. */
9173     if (paren) {
9174
9175         /* Under /x, space and comments can be gobbled up between the '(' and
9176          * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
9177          * intervening space, as the sequence is a token, and a token should be
9178          * indivisible */
9179         bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9180
9181         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9182             char *start_verb = RExC_parse;
9183             STRLEN verb_len = 0;
9184             char *start_arg = NULL;
9185             unsigned char op = 0;
9186             int argok = 1;
9187             int internal_argval = 0; /* internal_argval is only useful if !argok */
9188
9189             if (has_intervening_patws && SIZE_ONLY) {
9190                 ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated");
9191             }
9192             while ( *RExC_parse && *RExC_parse != ')' ) {
9193                 if ( *RExC_parse == ':' ) {
9194                     start_arg = RExC_parse + 1;
9195                     break;
9196                 }
9197                 RExC_parse++;
9198             }
9199             ++start_verb;
9200             verb_len = RExC_parse - start_verb;
9201             if ( start_arg ) {
9202                 RExC_parse++;
9203                 while ( *RExC_parse && *RExC_parse != ')' )
9204                     RExC_parse++;
9205                 if ( *RExC_parse != ')' )
9206                     vFAIL("Unterminated verb pattern argument");
9207                 if ( RExC_parse == start_arg )
9208                     start_arg = NULL;
9209             } else {
9210                 if ( *RExC_parse != ')' )
9211                     vFAIL("Unterminated verb pattern");
9212             }
9213
9214             switch ( *start_verb ) {
9215             case 'A':  /* (*ACCEPT) */
9216                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9217                     op = ACCEPT;
9218                     internal_argval = RExC_nestroot;
9219                 }
9220                 break;
9221             case 'C':  /* (*COMMIT) */
9222                 if ( memEQs(start_verb,verb_len,"COMMIT") )
9223                     op = COMMIT;
9224                 break;
9225             case 'F':  /* (*FAIL) */
9226                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9227                     op = OPFAIL;
9228                     argok = 0;
9229                 }
9230                 break;
9231             case ':':  /* (*:NAME) */
9232             case 'M':  /* (*MARK:NAME) */
9233                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9234                     op = MARKPOINT;
9235                     argok = -1;
9236                 }
9237                 break;
9238             case 'P':  /* (*PRUNE) */
9239                 if ( memEQs(start_verb,verb_len,"PRUNE") )
9240                     op = PRUNE;
9241                 break;
9242             case 'S':   /* (*SKIP) */
9243                 if ( memEQs(start_verb,verb_len,"SKIP") )
9244                     op = SKIP;
9245                 break;
9246             case 'T':  /* (*THEN) */
9247                 /* [19:06] <TimToady> :: is then */
9248                 if ( memEQs(start_verb,verb_len,"THEN") ) {
9249                     op = CUTGROUP;
9250                     RExC_seen |= REG_SEEN_CUTGROUP;
9251                 }
9252                 break;
9253             }
9254             if ( ! op ) {
9255                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9256                 vFAIL2utf8f(
9257                     "Unknown verb pattern '%"UTF8f"'",
9258                     UTF8fARG(UTF, verb_len, start_verb));
9259             }
9260             if ( argok ) {
9261                 if ( start_arg && internal_argval ) {
9262                     vFAIL3("Verb pattern '%.*s' may not have an argument",
9263                         verb_len, start_verb);
9264                 } else if ( argok < 0 && !start_arg ) {
9265                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9266                         verb_len, start_verb);
9267                 } else {
9268                     ret = reganode(pRExC_state, op, internal_argval);
9269                     if ( ! internal_argval && ! SIZE_ONLY ) {
9270                         if (start_arg) {
9271                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
9272                             ARG(ret) = add_data( pRExC_state, STR_WITH_LEN("S"));
9273                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9274                             ret->flags = 0;
9275                         } else {
9276                             ret->flags = 1;
9277                         }
9278                     }
9279                 }
9280                 if (!internal_argval)
9281                     RExC_seen |= REG_SEEN_VERBARG;
9282             } else if ( start_arg ) {
9283                 vFAIL3("Verb pattern '%.*s' may not have an argument",
9284                         verb_len, start_verb);
9285             } else {
9286                 ret = reg_node(pRExC_state, op);
9287             }
9288             nextchar(pRExC_state);
9289             return ret;
9290         }
9291         else if (*RExC_parse == '?') { /* (?...) */
9292             bool is_logical = 0;
9293             const char * const seqstart = RExC_parse;
9294             if (has_intervening_patws && SIZE_ONLY) {
9295                 ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated");
9296             }
9297
9298             RExC_parse++;
9299             paren = *RExC_parse++;
9300             ret = NULL;                 /* For look-ahead/behind. */
9301             switch (paren) {
9302
9303             case 'P':   /* (?P...) variants for those used to PCRE/Python */
9304                 paren = *RExC_parse++;
9305                 if ( paren == '<')         /* (?P<...>) named capture */
9306                     goto named_capture;
9307                 else if (paren == '>') {   /* (?P>name) named recursion */
9308                     goto named_recursion;
9309                 }
9310                 else if (paren == '=') {   /* (?P=...)  named backref */
9311                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
9312                        you change this make sure you change that */
9313                     char* name_start = RExC_parse;
9314                     U32 num = 0;
9315                     SV *sv_dat = reg_scan_name(pRExC_state,
9316                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9317                     if (RExC_parse == name_start || *RExC_parse != ')')
9318                         /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9319                         vFAIL2("Sequence %.3s... not terminated",parse_start);
9320
9321                     if (!SIZE_ONLY) {
9322                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
9323                         RExC_rxi->data->data[num]=(void*)sv_dat;
9324                         SvREFCNT_inc_simple_void(sv_dat);
9325                     }
9326                     RExC_sawback = 1;
9327                     ret = reganode(pRExC_state,
9328                                    ((! FOLD)
9329                                      ? NREF
9330                                      : (ASCII_FOLD_RESTRICTED)
9331                                        ? NREFFA
9332                                        : (AT_LEAST_UNI_SEMANTICS)
9333                                          ? NREFFU
9334                                          : (LOC)
9335                                            ? NREFFL
9336                                            : NREFF),
9337                                     num);
9338                     *flagp |= HASWIDTH;
9339
9340                     Set_Node_Offset(ret, parse_start+1);
9341                     Set_Node_Cur_Length(ret, parse_start);
9342
9343                     nextchar(pRExC_state);
9344                     return ret;
9345                 }
9346                 RExC_parse++;
9347                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9348                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9349                 /*NOTREACHED*/
9350             case '<':           /* (?<...) */
9351                 if (*RExC_parse == '!')
9352                     paren = ',';
9353                 else if (*RExC_parse != '=')
9354               named_capture:
9355                 {               /* (?<...>) */
9356                     char *name_start;
9357                     SV *svname;
9358                     paren= '>';
9359             case '\'':          /* (?'...') */
9360                     name_start= RExC_parse;
9361                     svname = reg_scan_name(pRExC_state,
9362                         SIZE_ONLY    /* reverse test from the others */
9363                         ? REG_RSN_RETURN_NAME
9364                         : REG_RSN_RETURN_NULL);
9365                     if (RExC_parse == name_start || *RExC_parse != paren)
9366                         vFAIL2("Sequence (?%c... not terminated",
9367                             paren=='>' ? '<' : paren);
9368                     if (SIZE_ONLY) {
9369                         HE *he_str;
9370                         SV *sv_dat = NULL;
9371                         if (!svname) /* shouldn't happen */
9372                             Perl_croak(aTHX_
9373                                 "panic: reg_scan_name returned NULL");
9374                         if (!RExC_paren_names) {
9375                             RExC_paren_names= newHV();
9376                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
9377 #ifdef DEBUGGING
9378                             RExC_paren_name_list= newAV();
9379                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
9380 #endif
9381                         }
9382                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
9383                         if ( he_str )
9384                             sv_dat = HeVAL(he_str);
9385                         if ( ! sv_dat ) {
9386                             /* croak baby croak */
9387                             Perl_croak(aTHX_
9388                                 "panic: paren_name hash element allocation failed");
9389                         } else if ( SvPOK(sv_dat) ) {
9390                             /* (?|...) can mean we have dupes so scan to check
9391                                its already been stored. Maybe a flag indicating
9392                                we are inside such a construct would be useful,
9393                                but the arrays are likely to be quite small, so
9394                                for now we punt -- dmq */
9395                             IV count = SvIV(sv_dat);
9396                             I32 *pv = (I32*)SvPVX(sv_dat);
9397                             IV i;
9398                             for ( i = 0 ; i < count ; i++ ) {
9399                                 if ( pv[i] == RExC_npar ) {
9400                                     count = 0;
9401                                     break;
9402                                 }
9403                             }
9404                             if ( count ) {
9405                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
9406                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
9407                                 pv[count] = RExC_npar;
9408                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
9409                             }
9410                         } else {
9411                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
9412                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
9413                             SvIOK_on(sv_dat);
9414                             SvIV_set(sv_dat, 1);
9415                         }
9416 #ifdef DEBUGGING
9417                         /* Yes this does cause a memory leak in debugging Perls */
9418                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
9419                             SvREFCNT_dec_NN(svname);
9420 #endif
9421
9422                         /*sv_dump(sv_dat);*/
9423                     }
9424                     nextchar(pRExC_state);
9425                     paren = 1;
9426                     goto capturing_parens;
9427                 }
9428                 RExC_seen |= REG_SEEN_LOOKBEHIND;
9429                 RExC_in_lookbehind++;
9430                 RExC_parse++;
9431             case '=':           /* (?=...) */
9432                 RExC_seen_zerolen++;
9433                 break;
9434             case '!':           /* (?!...) */
9435                 RExC_seen_zerolen++;
9436                 if (*RExC_parse == ')') {
9437                     ret=reg_node(pRExC_state, OPFAIL);
9438                     nextchar(pRExC_state);
9439                     return ret;
9440                 }
9441                 break;
9442             case '|':           /* (?|...) */
9443                 /* branch reset, behave like a (?:...) except that
9444                    buffers in alternations share the same numbers */
9445                 paren = ':';
9446                 after_freeze = freeze_paren = RExC_npar;
9447                 break;
9448             case ':':           /* (?:...) */
9449             case '>':           /* (?>...) */
9450                 break;
9451             case '$':           /* (?$...) */
9452             case '@':           /* (?@...) */
9453                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
9454                 break;
9455             case '#':           /* (?#...) */
9456                 /* XXX As soon as we disallow separating the '?' and '*' (by
9457                  * spaces or (?#...) comment), it is believed that this case
9458                  * will be unreachable and can be removed.  See
9459                  * [perl #117327] */
9460                 while (*RExC_parse && *RExC_parse != ')')
9461                     RExC_parse++;
9462                 if (*RExC_parse != ')')
9463                     FAIL("Sequence (?#... not terminated");
9464                 nextchar(pRExC_state);
9465                 *flagp = TRYAGAIN;
9466                 return NULL;
9467             case '0' :           /* (?0) */
9468             case 'R' :           /* (?R) */
9469                 if (*RExC_parse != ')')
9470                     FAIL("Sequence (?R) not terminated");
9471                 ret = reg_node(pRExC_state, GOSTART);
9472                     RExC_seen |= REG_SEEN_GOSTART;
9473                 *flagp |= POSTPONED;
9474                 nextchar(pRExC_state);
9475                 return ret;
9476                 /*notreached*/
9477             { /* named and numeric backreferences */
9478                 I32 num;
9479             case '&':            /* (?&NAME) */
9480                 parse_start = RExC_parse - 1;
9481               named_recursion:
9482                 {
9483                     SV *sv_dat = reg_scan_name(pRExC_state,
9484                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9485                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9486                 }
9487                 if (RExC_parse == RExC_end || *RExC_parse != ')')
9488                     vFAIL("Sequence (?&... not terminated");
9489                 goto gen_recurse_regop;
9490                 assert(0); /* NOT REACHED */
9491             case '+':
9492                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9493                     RExC_parse++;
9494                     vFAIL("Illegal pattern");
9495                 }
9496                 goto parse_recursion;
9497                 /* NOT REACHED*/
9498             case '-': /* (?-1) */
9499                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9500                     RExC_parse--; /* rewind to let it be handled later */
9501                     goto parse_flags;
9502                 }
9503                 /*FALLTHROUGH */
9504             case '1': case '2': case '3': case '4': /* (?1) */
9505             case '5': case '6': case '7': case '8': case '9':
9506                 RExC_parse--;
9507               parse_recursion:
9508                 num = atoi(RExC_parse);
9509                 parse_start = RExC_parse - 1; /* MJD */
9510                 if (*RExC_parse == '-')
9511                     RExC_parse++;
9512                 while (isDIGIT(*RExC_parse))
9513                         RExC_parse++;
9514                 if (*RExC_parse!=')')
9515                     vFAIL("Expecting close bracket");
9516
9517               gen_recurse_regop:
9518                 if ( paren == '-' ) {
9519                     /*
9520                     Diagram of capture buffer numbering.
9521                     Top line is the normal capture buffer numbers
9522                     Bottom line is the negative indexing as from
9523                     the X (the (?-2))
9524
9525                     +   1 2    3 4 5 X          6 7
9526                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
9527                     -   5 4    3 2 1 X          x x
9528
9529                     */
9530                     num = RExC_npar + num;
9531                     if (num < 1)  {
9532                         RExC_parse++;
9533                         vFAIL("Reference to nonexistent group");
9534                     }
9535                 } else if ( paren == '+' ) {
9536                     num = RExC_npar + num - 1;
9537                 }
9538
9539                 ret = reganode(pRExC_state, GOSUB, num);
9540                 if (!SIZE_ONLY) {
9541                     if (num > (I32)RExC_rx->nparens) {
9542                         RExC_parse++;
9543                         vFAIL("Reference to nonexistent group");
9544                     }
9545                     ARG2L_SET( ret, RExC_recurse_count++);
9546                     RExC_emit++;
9547                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9548                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
9549                 } else {
9550                     RExC_size++;
9551                 }
9552                 RExC_seen |= REG_SEEN_RECURSE;
9553                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
9554                 Set_Node_Offset(ret, parse_start); /* MJD */
9555
9556                 *flagp |= POSTPONED;
9557                 nextchar(pRExC_state);
9558                 return ret;
9559             } /* named and numeric backreferences */
9560             assert(0); /* NOT REACHED */
9561
9562             case '?':           /* (??...) */
9563                 is_logical = 1;
9564                 if (*RExC_parse != '{') {
9565                     RExC_parse++;
9566                     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9567                     vFAIL2utf8f(
9568                         "Sequence (%"UTF8f"...) not recognized",
9569                         UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9570                     /*NOTREACHED*/
9571                 }
9572                 *flagp |= POSTPONED;
9573                 paren = *RExC_parse++;
9574                 /* FALL THROUGH */
9575             case '{':           /* (?{...}) */
9576             {
9577                 U32 n = 0;
9578                 struct reg_code_block *cb;
9579
9580                 RExC_seen_zerolen++;
9581
9582                 if (   !pRExC_state->num_code_blocks
9583                     || pRExC_state->code_index >= pRExC_state->num_code_blocks
9584                     || pRExC_state->code_blocks[pRExC_state->code_index].start
9585                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
9586                             - RExC_start)
9587                 ) {
9588                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
9589                         FAIL("panic: Sequence (?{...}): no code block found\n");
9590                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
9591                 }
9592                 /* this is a pre-compiled code block (?{...}) */
9593                 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
9594                 RExC_parse = RExC_start + cb->end;
9595                 if (!SIZE_ONLY) {
9596                     OP *o = cb->block;
9597                     if (cb->src_regex) {
9598                         n = add_data(pRExC_state, STR_WITH_LEN("rl"));
9599                         RExC_rxi->data->data[n] =
9600                             (void*)SvREFCNT_inc((SV*)cb->src_regex);
9601                         RExC_rxi->data->data[n+1] = (void*)o;
9602                     }
9603                     else {
9604                         n = add_data(pRExC_state,
9605                                (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
9606                         RExC_rxi->data->data[n] = (void*)o;
9607                     }
9608                 }
9609                 pRExC_state->code_index++;
9610                 nextchar(pRExC_state);
9611
9612                 if (is_logical) {
9613                     regnode *eval;
9614                     ret = reg_node(pRExC_state, LOGICAL);
9615                     eval = reganode(pRExC_state, EVAL, n);
9616                     if (!SIZE_ONLY) {
9617                         ret->flags = 2;
9618                         /* for later propagation into (??{}) return value */
9619                         eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
9620                     }
9621                     REGTAIL(pRExC_state, ret, eval);
9622                     /* deal with the length of this later - MJD */
9623                     return ret;
9624                 }
9625                 ret = reganode(pRExC_state, EVAL, n);
9626                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
9627                 Set_Node_Offset(ret, parse_start);
9628                 return ret;
9629             }
9630             case '(':           /* (?(?{...})...) and (?(?=...)...) */
9631             {
9632                 int is_define= 0;
9633                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
9634                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9635                         || RExC_parse[1] == '<'
9636                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
9637                         I32 flag;
9638                         regnode *tail;
9639
9640                         ret = reg_node(pRExC_state, LOGICAL);
9641                         if (!SIZE_ONLY)
9642                             ret->flags = 1;
9643
9644                         tail = reg(pRExC_state, 1, &flag, depth+1);
9645                         if (flag & RESTART_UTF8) {
9646                             *flagp = RESTART_UTF8;
9647                             return NULL;
9648                         }
9649                         REGTAIL(pRExC_state, ret, tail);
9650                         goto insert_if;
9651                     }
9652                 }
9653                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
9654                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9655                 {
9656                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
9657                     char *name_start= RExC_parse++;
9658                     U32 num = 0;
9659                     SV *sv_dat=reg_scan_name(pRExC_state,
9660                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9661                     if (RExC_parse == name_start || *RExC_parse != ch)
9662                         vFAIL2("Sequence (?(%c... not terminated",
9663                             (ch == '>' ? '<' : ch));
9664                     RExC_parse++;
9665                     if (!SIZE_ONLY) {
9666                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
9667                         RExC_rxi->data->data[num]=(void*)sv_dat;
9668                         SvREFCNT_inc_simple_void(sv_dat);
9669                     }
9670                     ret = reganode(pRExC_state,NGROUPP,num);
9671                     goto insert_if_check_paren;
9672                 }
9673                 else if (RExC_parse[0] == 'D' &&
9674                          RExC_parse[1] == 'E' &&
9675                          RExC_parse[2] == 'F' &&
9676                          RExC_parse[3] == 'I' &&
9677                          RExC_parse[4] == 'N' &&
9678                          RExC_parse[5] == 'E')
9679                 {
9680                     ret = reganode(pRExC_state,DEFINEP,0);
9681                     RExC_parse +=6 ;
9682                     is_define = 1;
9683                     goto insert_if_check_paren;
9684                 }
9685                 else if (RExC_parse[0] == 'R') {
9686                     RExC_parse++;
9687                     parno = 0;
9688                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9689                         parno = atoi(RExC_parse++);
9690                         while (isDIGIT(*RExC_parse))
9691                             RExC_parse++;
9692                     } else if (RExC_parse[0] == '&') {
9693                         SV *sv_dat;
9694                         RExC_parse++;
9695                         sv_dat = reg_scan_name(pRExC_state,
9696                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9697                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9698                     }
9699                     ret = reganode(pRExC_state,INSUBP,parno);
9700                     goto insert_if_check_paren;
9701                 }
9702                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9703                     /* (?(1)...) */
9704                     char c;
9705                     char *tmp;
9706                     parno = atoi(RExC_parse++);
9707
9708                     while (isDIGIT(*RExC_parse))
9709                         RExC_parse++;
9710                     ret = reganode(pRExC_state, GROUPP, parno);
9711
9712                  insert_if_check_paren:
9713                     if (*(tmp = nextchar(pRExC_state)) != ')') {
9714                         /* nextchar also skips comments, so undo its work
9715                          * and skip over the the next character.
9716                          */
9717                         RExC_parse = tmp;
9718                         RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9719                         vFAIL("Switch condition not recognized");
9720                     }
9721                   insert_if:
9722                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
9723                     br = regbranch(pRExC_state, &flags, 1,depth+1);
9724                     if (br == NULL) {
9725                         if (flags & RESTART_UTF8) {
9726                             *flagp = RESTART_UTF8;
9727                             return NULL;
9728                         }
9729                         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9730                               (UV) flags);
9731                     } else
9732                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
9733                     c = *nextchar(pRExC_state);
9734                     if (flags&HASWIDTH)
9735                         *flagp |= HASWIDTH;
9736                     if (c == '|') {
9737                         if (is_define)
9738                             vFAIL("(?(DEFINE)....) does not allow branches");
9739                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
9740                         if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
9741                             if (flags & RESTART_UTF8) {
9742                                 *flagp = RESTART_UTF8;
9743                                 return NULL;
9744                             }
9745                             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9746                                   (UV) flags);
9747                         }
9748                         REGTAIL(pRExC_state, ret, lastbr);
9749                         if (flags&HASWIDTH)
9750                             *flagp |= HASWIDTH;
9751                         c = *nextchar(pRExC_state);
9752                     }
9753                     else
9754                         lastbr = NULL;
9755                     if (c != ')')
9756                         vFAIL("Switch (?(condition)... contains too many branches");
9757                     ender = reg_node(pRExC_state, TAIL);
9758                     REGTAIL(pRExC_state, br, ender);
9759                     if (lastbr) {
9760                         REGTAIL(pRExC_state, lastbr, ender);
9761                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
9762                     }
9763                     else
9764                         REGTAIL(pRExC_state, ret, ender);
9765                     RExC_size++; /* XXX WHY do we need this?!!
9766                                     For large programs it seems to be required
9767                                     but I can't figure out why. -- dmq*/
9768                     return ret;
9769                 }
9770                 else {
9771                     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9772                     vFAIL("Unknown switch condition (?(...))");
9773                 }
9774             }
9775             case '[':           /* (?[ ... ]) */
9776                 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
9777                                          oregcomp_parse);
9778             case 0:
9779                 RExC_parse--; /* for vFAIL to print correctly */
9780                 vFAIL("Sequence (? incomplete");
9781                 break;
9782             default: /* e.g., (?i) */
9783                 --RExC_parse;
9784               parse_flags:
9785                 parse_lparen_question_flags(pRExC_state);
9786                 if (UCHARAT(RExC_parse) != ':') {
9787                     nextchar(pRExC_state);
9788                     *flagp = TRYAGAIN;
9789                     return NULL;
9790                 }
9791                 paren = ':';
9792                 nextchar(pRExC_state);
9793                 ret = NULL;
9794                 goto parse_rest;
9795             } /* end switch */
9796         }
9797         else {                  /* (...) */
9798           capturing_parens:
9799             parno = RExC_npar;
9800             RExC_npar++;
9801
9802             ret = reganode(pRExC_state, OPEN, parno);
9803             if (!SIZE_ONLY ){
9804                 if (!RExC_nestroot)
9805                     RExC_nestroot = parno;
9806                 if (RExC_seen & REG_SEEN_RECURSE
9807                     && !RExC_open_parens[parno-1])
9808                 {
9809                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9810                         "Setting open paren #%"IVdf" to %d\n",
9811                         (IV)parno, REG_NODE_NUM(ret)));
9812                     RExC_open_parens[parno-1]= ret;
9813                 }
9814             }
9815             Set_Node_Length(ret, 1); /* MJD */
9816             Set_Node_Offset(ret, RExC_parse); /* MJD */
9817             is_open = 1;
9818         }
9819     }
9820     else                        /* ! paren */
9821         ret = NULL;
9822
9823    parse_rest:
9824     /* Pick up the branches, linking them together. */
9825     parse_start = RExC_parse;   /* MJD */
9826     br = regbranch(pRExC_state, &flags, 1,depth+1);
9827
9828     /*     branch_len = (paren != 0); */
9829
9830     if (br == NULL) {
9831         if (flags & RESTART_UTF8) {
9832             *flagp = RESTART_UTF8;
9833             return NULL;
9834         }
9835         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
9836     }
9837     if (*RExC_parse == '|') {
9838         if (!SIZE_ONLY && RExC_extralen) {
9839             reginsert(pRExC_state, BRANCHJ, br, depth+1);
9840         }
9841         else {                  /* MJD */
9842             reginsert(pRExC_state, BRANCH, br, depth+1);
9843             Set_Node_Length(br, paren != 0);
9844             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9845         }
9846         have_branch = 1;
9847         if (SIZE_ONLY)
9848             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
9849     }
9850     else if (paren == ':') {
9851         *flagp |= flags&SIMPLE;
9852     }
9853     if (is_open) {                              /* Starts with OPEN. */
9854         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
9855     }
9856     else if (paren != '?')              /* Not Conditional */
9857         ret = br;
9858     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9859     lastbr = br;
9860     while (*RExC_parse == '|') {
9861         if (!SIZE_ONLY && RExC_extralen) {
9862             ender = reganode(pRExC_state, LONGJMP,0);
9863             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9864         }
9865         if (SIZE_ONLY)
9866             RExC_extralen += 2;         /* Account for LONGJMP. */
9867         nextchar(pRExC_state);
9868         if (freeze_paren) {
9869             if (RExC_npar > after_freeze)
9870                 after_freeze = RExC_npar;
9871             RExC_npar = freeze_paren;
9872         }
9873         br = regbranch(pRExC_state, &flags, 0, depth+1);
9874
9875         if (br == NULL) {
9876             if (flags & RESTART_UTF8) {
9877                 *flagp = RESTART_UTF8;
9878                 return NULL;
9879             }
9880             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
9881         }
9882         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
9883         lastbr = br;
9884         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9885     }
9886
9887     if (have_branch || paren != ':') {
9888         /* Make a closing node, and hook it on the end. */
9889         switch (paren) {
9890         case ':':
9891             ender = reg_node(pRExC_state, TAIL);
9892             break;
9893         case 1: case 2:
9894             ender = reganode(pRExC_state, CLOSE, parno);
9895             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9896                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9897                         "Setting close paren #%"IVdf" to %d\n",
9898                         (IV)parno, REG_NODE_NUM(ender)));
9899                 RExC_close_parens[parno-1]= ender;
9900                 if (RExC_nestroot == parno)
9901                     RExC_nestroot = 0;
9902             }
9903             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9904             Set_Node_Length(ender,1); /* MJD */
9905             break;
9906         case '<':
9907         case ',':
9908         case '=':
9909         case '!':
9910             *flagp &= ~HASWIDTH;
9911             /* FALL THROUGH */
9912         case '>':
9913             ender = reg_node(pRExC_state, SUCCEED);
9914             break;
9915         case 0:
9916             ender = reg_node(pRExC_state, END);
9917             if (!SIZE_ONLY) {
9918                 assert(!RExC_opend); /* there can only be one! */
9919                 RExC_opend = ender;
9920             }
9921             break;
9922         }
9923         DEBUG_PARSE_r(if (!SIZE_ONLY) {
9924             SV * const mysv_val1=sv_newmortal();
9925             SV * const mysv_val2=sv_newmortal();
9926             DEBUG_PARSE_MSG("lsbr");
9927             regprop(RExC_rx, mysv_val1, lastbr);
9928             regprop(RExC_rx, mysv_val2, ender);
9929             PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9930                           SvPV_nolen_const(mysv_val1),
9931                           (IV)REG_NODE_NUM(lastbr),
9932                           SvPV_nolen_const(mysv_val2),
9933                           (IV)REG_NODE_NUM(ender),
9934                           (IV)(ender - lastbr)
9935             );
9936         });
9937         REGTAIL(pRExC_state, lastbr, ender);
9938
9939         if (have_branch && !SIZE_ONLY) {
9940             char is_nothing= 1;
9941             if (depth==1)
9942                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9943
9944             /* Hook the tails of the branches to the closing node. */
9945             for (br = ret; br; br = regnext(br)) {
9946                 const U8 op = PL_regkind[OP(br)];
9947                 if (op == BRANCH) {
9948                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9949                     if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9950                         is_nothing= 0;
9951                 }
9952                 else if (op == BRANCHJ) {
9953                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9954                     /* for now we always disable this optimisation * /
9955                     if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9956                     */
9957                         is_nothing= 0;
9958                 }
9959             }
9960             if (is_nothing) {
9961                 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9962                 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9963                     SV * const mysv_val1=sv_newmortal();
9964                     SV * const mysv_val2=sv_newmortal();
9965                     DEBUG_PARSE_MSG("NADA");
9966                     regprop(RExC_rx, mysv_val1, ret);
9967                     regprop(RExC_rx, mysv_val2, ender);
9968                     PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9969                                   SvPV_nolen_const(mysv_val1),
9970                                   (IV)REG_NODE_NUM(ret),
9971                                   SvPV_nolen_const(mysv_val2),
9972                                   (IV)REG_NODE_NUM(ender),
9973                                   (IV)(ender - ret)
9974                     );
9975                 });
9976                 OP(br)= NOTHING;
9977                 if (OP(ender) == TAIL) {
9978                     NEXT_OFF(br)= 0;
9979                     RExC_emit= br + 1;
9980                 } else {
9981                     regnode *opt;
9982                     for ( opt= br + 1; opt < ender ; opt++ )
9983                         OP(opt)= OPTIMIZED;
9984                     NEXT_OFF(br)= ender - br;
9985                 }
9986             }
9987         }
9988     }
9989
9990     {
9991         const char *p;
9992         static const char parens[] = "=!<,>";
9993
9994         if (paren && (p = strchr(parens, paren))) {
9995             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9996             int flag = (p - parens) > 1;
9997
9998             if (paren == '>')
9999                 node = SUSPEND, flag = 0;
10000             reginsert(pRExC_state, node,ret, depth+1);
10001             Set_Node_Cur_Length(ret, parse_start);
10002             Set_Node_Offset(ret, parse_start + 1);
10003             ret->flags = flag;
10004             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10005         }
10006     }
10007
10008     /* Check for proper termination. */
10009     if (paren) {
10010         /* restore original flags, but keep (?p) */
10011         RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10012         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
10013             RExC_parse = oregcomp_parse;
10014             vFAIL("Unmatched (");
10015         }
10016     }
10017     else if (!paren && RExC_parse < RExC_end) {
10018         if (*RExC_parse == ')') {
10019             RExC_parse++;
10020             vFAIL("Unmatched )");
10021         }
10022         else
10023             FAIL("Junk on end of regexp");      /* "Can't happen". */
10024         assert(0); /* NOTREACHED */
10025     }
10026
10027     if (RExC_in_lookbehind) {
10028         RExC_in_lookbehind--;
10029     }
10030     if (after_freeze > RExC_npar)
10031         RExC_npar = after_freeze;
10032     return(ret);
10033 }
10034
10035 /*
10036  - regbranch - one alternative of an | operator
10037  *
10038  * Implements the concatenation operator.
10039  *
10040  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10041  * restarted.
10042  */
10043 STATIC regnode *
10044 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10045 {
10046     dVAR;
10047     regnode *ret;
10048     regnode *chain = NULL;
10049     regnode *latest;
10050     I32 flags = 0, c = 0;
10051     GET_RE_DEBUG_FLAGS_DECL;
10052
10053     PERL_ARGS_ASSERT_REGBRANCH;
10054
10055     DEBUG_PARSE("brnc");
10056
10057     if (first)
10058         ret = NULL;
10059     else {
10060         if (!SIZE_ONLY && RExC_extralen)
10061             ret = reganode(pRExC_state, BRANCHJ,0);
10062         else {
10063             ret = reg_node(pRExC_state, BRANCH);
10064             Set_Node_Length(ret, 1);
10065         }
10066     }
10067
10068     if (!first && SIZE_ONLY)
10069         RExC_extralen += 1;                     /* BRANCHJ */
10070
10071     *flagp = WORST;                     /* Tentatively. */
10072
10073     RExC_parse--;
10074     nextchar(pRExC_state);
10075     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10076         flags &= ~TRYAGAIN;
10077         latest = regpiece(pRExC_state, &flags,depth+1);
10078         if (latest == NULL) {
10079             if (flags & TRYAGAIN)
10080                 continue;
10081             if (flags & RESTART_UTF8) {
10082                 *flagp = RESTART_UTF8;
10083                 return NULL;
10084             }
10085             FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10086         }
10087         else if (ret == NULL)
10088             ret = latest;
10089         *flagp |= flags&(HASWIDTH|POSTPONED);
10090         if (chain == NULL)      /* First piece. */
10091             *flagp |= flags&SPSTART;
10092         else {
10093             RExC_naughty++;
10094             REGTAIL(pRExC_state, chain, latest);
10095         }
10096         chain = latest;
10097         c++;
10098     }
10099     if (chain == NULL) {        /* Loop ran zero times. */
10100         chain = reg_node(pRExC_state, NOTHING);
10101         if (ret == NULL)
10102             ret = chain;
10103     }
10104     if (c == 1) {
10105         *flagp |= flags&SIMPLE;
10106     }
10107
10108     return ret;
10109 }
10110
10111 /*
10112  - regpiece - something followed by possible [*+?]
10113  *
10114  * Note that the branching code sequences used for ? and the general cases
10115  * of * and + are somewhat optimized:  they use the same NOTHING node as
10116  * both the endmarker for their branch list and the body of the last branch.
10117  * It might seem that this node could be dispensed with entirely, but the
10118  * endmarker role is not redundant.
10119  *
10120  * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10121  * TRYAGAIN.
10122  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10123  * restarted.
10124  */
10125 STATIC regnode *
10126 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10127 {
10128     dVAR;
10129     regnode *ret;
10130     char op;
10131     char *next;
10132     I32 flags;
10133     const char * const origparse = RExC_parse;
10134     I32 min;
10135     I32 max = REG_INFTY;
10136 #ifdef RE_TRACK_PATTERN_OFFSETS
10137     char *parse_start;
10138 #endif
10139     const char *maxpos = NULL;
10140
10141     /* Save the original in case we change the emitted regop to a FAIL. */
10142     regnode * const orig_emit = RExC_emit;
10143
10144     GET_RE_DEBUG_FLAGS_DECL;
10145
10146     PERL_ARGS_ASSERT_REGPIECE;
10147
10148     DEBUG_PARSE("piec");
10149
10150     ret = regatom(pRExC_state, &flags,depth+1);
10151     if (ret == NULL) {
10152         if (flags & (TRYAGAIN|RESTART_UTF8))
10153             *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10154         else
10155             FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10156         return(NULL);
10157     }
10158
10159     op = *RExC_parse;
10160
10161     if (op == '{' && regcurly(RExC_parse, FALSE)) {
10162         maxpos = NULL;
10163 #ifdef RE_TRACK_PATTERN_OFFSETS
10164         parse_start = RExC_parse; /* MJD */
10165 #endif
10166         next = RExC_parse + 1;
10167         while (isDIGIT(*next) || *next == ',') {
10168             if (*next == ',') {
10169                 if (maxpos)
10170                     break;
10171                 else
10172                     maxpos = next;
10173             }
10174             next++;
10175         }
10176         if (*next == '}') {             /* got one */
10177             if (!maxpos)
10178                 maxpos = next;
10179             RExC_parse++;
10180             min = atoi(RExC_parse);
10181             if (*maxpos == ',')
10182                 maxpos++;
10183             else
10184                 maxpos = RExC_parse;
10185             max = atoi(maxpos);
10186             if (!max && *maxpos != '0')
10187                 max = REG_INFTY;                /* meaning "infinity" */
10188             else if (max >= REG_INFTY)
10189                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10190             RExC_parse = next;
10191             nextchar(pRExC_state);
10192             if (max < min) {    /* If can't match, warn and optimize to fail
10193                                    unconditionally */
10194                 if (SIZE_ONLY) {
10195                     ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10196
10197                     /* We can't back off the size because we have to reserve
10198                      * enough space for all the things we are about to throw
10199                      * away, but we can shrink it by the ammount we are about
10200                      * to re-use here */
10201                     RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10202                 }
10203                 else {
10204                     RExC_emit = orig_emit;
10205                 }
10206                 ret = reg_node(pRExC_state, OPFAIL);
10207                 return ret;
10208             }
10209             else if (min == max
10210                      && RExC_parse < RExC_end
10211                      && (*RExC_parse == '?' || *RExC_parse == '+'))
10212             {
10213                 if (SIZE_ONLY) {
10214                     ckWARN2reg(RExC_parse + 1,
10215                                "Useless use of greediness modifier '%c'",
10216                                *RExC_parse);
10217                 }
10218                 /* Absorb the modifier, so later code doesn't see nor use
10219                     * it */
10220                 nextchar(pRExC_state);
10221             }
10222
10223         do_curly:
10224             if ((flags&SIMPLE)) {
10225                 RExC_naughty += 2 + RExC_naughty / 2;
10226                 reginsert(pRExC_state, CURLY, ret, depth+1);
10227                 Set_Node_Offset(ret, parse_start+1); /* MJD */
10228                 Set_Node_Cur_Length(ret, parse_start);
10229             }
10230             else {
10231                 regnode * const w = reg_node(pRExC_state, WHILEM);
10232
10233                 w->flags = 0;
10234                 REGTAIL(pRExC_state, ret, w);
10235                 if (!SIZE_ONLY && RExC_extralen) {
10236                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
10237                     reginsert(pRExC_state, NOTHING,ret, depth+1);
10238                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
10239                 }
10240                 reginsert(pRExC_state, CURLYX,ret, depth+1);
10241                                 /* MJD hk */
10242                 Set_Node_Offset(ret, parse_start+1);
10243                 Set_Node_Length(ret,
10244                                 op == '{' ? (RExC_parse - parse_start) : 1);
10245
10246                 if (!SIZE_ONLY && RExC_extralen)
10247                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
10248                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10249                 if (SIZE_ONLY)
10250                     RExC_whilem_seen++, RExC_extralen += 3;
10251                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
10252             }
10253             ret->flags = 0;
10254
10255             if (min > 0)
10256                 *flagp = WORST;
10257             if (max > 0)
10258                 *flagp |= HASWIDTH;
10259             if (!SIZE_ONLY) {
10260                 ARG1_SET(ret, (U16)min);
10261                 ARG2_SET(ret, (U16)max);
10262             }
10263
10264             goto nest_check;
10265         }
10266     }
10267
10268     if (!ISMULT1(op)) {
10269         *flagp = flags;
10270         return(ret);
10271     }
10272
10273 #if 0                           /* Now runtime fix should be reliable. */
10274
10275     /* if this is reinstated, don't forget to put this back into perldiag:
10276
10277             =item Regexp *+ operand could be empty at {#} in regex m/%s/
10278
10279            (F) The part of the regexp subject to either the * or + quantifier
10280            could match an empty string. The {#} shows in the regular
10281            expression about where the problem was discovered.
10282
10283     */
10284
10285     if (!(flags&HASWIDTH) && op != '?')
10286       vFAIL("Regexp *+ operand could be empty");
10287 #endif
10288
10289 #ifdef RE_TRACK_PATTERN_OFFSETS
10290     parse_start = RExC_parse;
10291 #endif
10292     nextchar(pRExC_state);
10293
10294     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10295
10296     if (op == '*' && (flags&SIMPLE)) {
10297         reginsert(pRExC_state, STAR, ret, depth+1);
10298         ret->flags = 0;
10299         RExC_naughty += 4;
10300     }
10301     else if (op == '*') {
10302         min = 0;
10303         goto do_curly;
10304     }
10305     else if (op == '+' && (flags&SIMPLE)) {
10306         reginsert(pRExC_state, PLUS, ret, depth+1);
10307         ret->flags = 0;
10308         RExC_naughty += 3;
10309     }
10310     else if (op == '+') {
10311         min = 1;
10312         goto do_curly;
10313     }
10314     else if (op == '?') {
10315         min = 0; max = 1;
10316         goto do_curly;
10317     }
10318   nest_check:
10319     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10320         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10321         ckWARN2reg(RExC_parse,
10322                    "%"UTF8f" matches null string many times",
10323                    UTF8fARG(UTF, (RExC_parse >= origparse ? RExC_parse - origparse : 0),
10324                    origparse));
10325         (void)ReREFCNT_inc(RExC_rx_sv);
10326     }
10327
10328     if (RExC_parse < RExC_end && *RExC_parse == '?') {
10329         nextchar(pRExC_state);
10330         reginsert(pRExC_state, MINMOD, ret, depth+1);
10331         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
10332     }
10333     else
10334     if (RExC_parse < RExC_end && *RExC_parse == '+') {
10335         regnode *ender;
10336         nextchar(pRExC_state);
10337         ender = reg_node(pRExC_state, SUCCEED);
10338         REGTAIL(pRExC_state, ret, ender);
10339         reginsert(pRExC_state, SUSPEND, ret, depth+1);
10340         ret->flags = 0;
10341         ender = reg_node(pRExC_state, TAIL);
10342         REGTAIL(pRExC_state, ret, ender);
10343     }
10344
10345     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
10346         RExC_parse++;
10347         vFAIL("Nested quantifiers");
10348     }
10349
10350     return(ret);
10351 }
10352
10353 STATIC bool
10354 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
10355         const bool strict   /* Apply stricter parsing rules? */
10356     )
10357 {
10358
10359  /* This is expected to be called by a parser routine that has recognized '\N'
10360    and needs to handle the rest. RExC_parse is expected to point at the first
10361    char following the N at the time of the call.  On successful return,
10362    RExC_parse has been updated to point to just after the sequence identified
10363    by this routine, and <*flagp> has been updated.
10364
10365    The \N may be inside (indicated by the boolean <in_char_class>) or outside a
10366    character class.
10367
10368    \N may begin either a named sequence, or if outside a character class, mean
10369    to match a non-newline.  For non single-quoted regexes, the tokenizer has
10370    attempted to decide which, and in the case of a named sequence, converted it
10371    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
10372    where c1... are the characters in the sequence.  For single-quoted regexes,
10373    the tokenizer passes the \N sequence through unchanged; this code will not
10374    attempt to determine this nor expand those, instead raising a syntax error.
10375    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
10376    or there is no '}', it signals that this \N occurrence means to match a
10377    non-newline.
10378
10379    Only the \N{U+...} form should occur in a character class, for the same
10380    reason that '.' inside a character class means to just match a period: it
10381    just doesn't make sense.
10382
10383    The function raises an error (via vFAIL), and doesn't return for various
10384    syntax errors.  Otherwise it returns TRUE and sets <node_p> or <valuep> on
10385    success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
10386    RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
10387    only possible if node_p is non-NULL.
10388
10389
10390    If <valuep> is non-null, it means the caller can accept an input sequence
10391    consisting of a just a single code point; <*valuep> is set to that value
10392    if the input is such.
10393
10394    If <node_p> is non-null it signifies that the caller can accept any other
10395    legal sequence (i.e., one that isn't just a single code point).  <*node_p>
10396    is set as follows:
10397     1) \N means not-a-NL: points to a newly created REG_ANY node;
10398     2) \N{}:              points to a new NOTHING node;
10399     3) otherwise:         points to a new EXACT node containing the resolved
10400                           string.
10401    Note that FALSE is returned for single code point sequences if <valuep> is
10402    null.
10403  */
10404
10405     char * endbrace;    /* '}' following the name */
10406     char* p;
10407     char *endchar;      /* Points to '.' or '}' ending cur char in the input
10408                            stream */
10409     bool has_multiple_chars; /* true if the input stream contains a sequence of
10410                                 more than one character */
10411
10412     GET_RE_DEBUG_FLAGS_DECL;
10413
10414     PERL_ARGS_ASSERT_GROK_BSLASH_N;
10415
10416     GET_RE_DEBUG_FLAGS;
10417
10418     assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
10419
10420     /* The [^\n] meaning of \N ignores spaces and comments under the /x
10421      * modifier.  The other meaning does not, so use a temporary until we find
10422      * out which we are being called with */
10423     p = (RExC_flags & RXf_PMf_EXTENDED)
10424         ? regwhite( pRExC_state, RExC_parse )
10425         : RExC_parse;
10426
10427     /* Disambiguate between \N meaning a named character versus \N meaning
10428      * [^\n].  The former is assumed when it can't be the latter. */
10429     if (*p != '{' || regcurly(p, FALSE)) {
10430         RExC_parse = p;
10431         if (! node_p) {
10432             /* no bare \N allowed in a charclass */
10433             if (in_char_class) {
10434                 vFAIL("\\N in a character class must be a named character: \\N{...}");
10435             }
10436             return FALSE;
10437         }
10438         RExC_parse--;   /* Need to back off so nextchar() doesn't skip the
10439                            current char */
10440         nextchar(pRExC_state);
10441         *node_p = reg_node(pRExC_state, REG_ANY);
10442         *flagp |= HASWIDTH|SIMPLE;
10443         RExC_naughty++;
10444         Set_Node_Length(*node_p, 1); /* MJD */
10445         return TRUE;
10446     }
10447
10448     /* Here, we have decided it should be a named character or sequence */
10449
10450     /* The test above made sure that the next real character is a '{', but
10451      * under the /x modifier, it could be separated by space (or a comment and
10452      * \n) and this is not allowed (for consistency with \x{...} and the
10453      * tokenizer handling of \N{NAME}). */
10454     if (*RExC_parse != '{') {
10455         vFAIL("Missing braces on \\N{}");
10456     }
10457
10458     RExC_parse++;       /* Skip past the '{' */
10459
10460     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
10461         || ! (endbrace == RExC_parse            /* nothing between the {} */
10462               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
10463                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
10464     {
10465         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
10466         vFAIL("\\N{NAME} must be resolved by the lexer");
10467     }
10468
10469     if (endbrace == RExC_parse) {   /* empty: \N{} */
10470         bool ret = TRUE;
10471         if (node_p) {
10472             *node_p = reg_node(pRExC_state,NOTHING);
10473         }
10474         else if (in_char_class) {
10475             if (SIZE_ONLY && in_char_class) {
10476                 if (strict) {
10477                     RExC_parse++;   /* Position after the "}" */
10478                     vFAIL("Zero length \\N{}");
10479                 }
10480                 else {
10481                     ckWARNreg(RExC_parse,
10482                               "Ignoring zero length \\N{} in character class");
10483                 }
10484             }
10485             ret = FALSE;
10486         }
10487         else {
10488             return FALSE;
10489         }
10490         nextchar(pRExC_state);
10491         return ret;
10492     }
10493
10494     RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
10495     RExC_parse += 2;    /* Skip past the 'U+' */
10496
10497     endchar = RExC_parse + strcspn(RExC_parse, ".}");
10498
10499     /* Code points are separated by dots.  If none, there is only one code
10500      * point, and is terminated by the brace */
10501     has_multiple_chars = (endchar < endbrace);
10502
10503     if (valuep && (! has_multiple_chars || in_char_class)) {
10504         /* We only pay attention to the first char of
10505         multichar strings being returned in char classes. I kinda wonder
10506         if this makes sense as it does change the behaviour
10507         from earlier versions, OTOH that behaviour was broken
10508         as well. XXX Solution is to recharacterize as
10509         [rest-of-class]|multi1|multi2... */
10510
10511         STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
10512         I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
10513             | PERL_SCAN_DISALLOW_PREFIX
10514             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
10515
10516         *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
10517
10518         /* The tokenizer should have guaranteed validity, but it's possible to
10519          * bypass it by using single quoting, so check */
10520         if (length_of_hex == 0
10521             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
10522         {
10523             RExC_parse += length_of_hex;        /* Includes all the valid */
10524             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
10525                             ? UTF8SKIP(RExC_parse)
10526                             : 1;
10527             /* Guard against malformed utf8 */
10528             if (RExC_parse >= endchar) {
10529                 RExC_parse = endchar;
10530             }
10531             vFAIL("Invalid hexadecimal number in \\N{U+...}");
10532         }
10533
10534         if (in_char_class && has_multiple_chars) {
10535             if (strict) {
10536                 RExC_parse = endbrace;
10537                 vFAIL("\\N{} in character class restricted to one character");
10538             }
10539             else {
10540                 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
10541             }
10542         }
10543
10544         RExC_parse = endbrace + 1;
10545     }
10546     else if (! node_p || ! has_multiple_chars) {
10547
10548         /* Here, the input is legal, but not according to the caller's
10549          * options.  We fail without advancing the parse, so that the
10550          * caller can try again */
10551         RExC_parse = p;
10552         return FALSE;
10553     }
10554     else {
10555
10556         /* What is done here is to convert this to a sub-pattern of the form
10557          * (?:\x{char1}\x{char2}...)
10558          * and then call reg recursively.  That way, it retains its atomicness,
10559          * while not having to worry about special handling that some code
10560          * points may have.  toke.c has converted the original Unicode values
10561          * to native, so that we can just pass on the hex values unchanged.  We
10562          * do have to set a flag to keep recoding from happening in the
10563          * recursion */
10564
10565         SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
10566         STRLEN len;
10567         char *orig_end = RExC_end;
10568         I32 flags;
10569
10570         while (RExC_parse < endbrace) {
10571
10572             /* Convert to notation the rest of the code understands */
10573             sv_catpv(substitute_parse, "\\x{");
10574             sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
10575             sv_catpv(substitute_parse, "}");
10576
10577             /* Point to the beginning of the next character in the sequence. */
10578             RExC_parse = endchar + 1;
10579             endchar = RExC_parse + strcspn(RExC_parse, ".}");
10580         }
10581         sv_catpv(substitute_parse, ")");
10582
10583         RExC_parse = SvPV(substitute_parse, len);
10584
10585         /* Don't allow empty number */
10586         if (len < 8) {
10587             vFAIL("Invalid hexadecimal number in \\N{U+...}");
10588         }
10589         RExC_end = RExC_parse + len;
10590
10591         /* The values are Unicode, and therefore not subject to recoding */
10592         RExC_override_recoding = 1;
10593
10594         if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
10595             if (flags & RESTART_UTF8) {
10596                 *flagp = RESTART_UTF8;
10597                 return FALSE;
10598             }
10599             FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
10600                   (UV) flags);
10601         }
10602         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10603
10604         RExC_parse = endbrace;
10605         RExC_end = orig_end;
10606         RExC_override_recoding = 0;
10607
10608         nextchar(pRExC_state);
10609     }
10610
10611     return TRUE;
10612 }
10613
10614
10615 /*
10616  * reg_recode
10617  *
10618  * It returns the code point in utf8 for the value in *encp.
10619  *    value: a code value in the source encoding
10620  *    encp:  a pointer to an Encode object
10621  *
10622  * If the result from Encode is not a single character,
10623  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
10624  */
10625 STATIC UV
10626 S_reg_recode(pTHX_ const char value, SV **encp)
10627 {
10628     STRLEN numlen = 1;
10629     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
10630     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
10631     const STRLEN newlen = SvCUR(sv);
10632     UV uv = UNICODE_REPLACEMENT;
10633
10634     PERL_ARGS_ASSERT_REG_RECODE;
10635
10636     if (newlen)
10637         uv = SvUTF8(sv)
10638              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
10639              : *(U8*)s;
10640
10641     if (!newlen || numlen != newlen) {
10642         uv = UNICODE_REPLACEMENT;
10643         *encp = NULL;
10644     }
10645     return uv;
10646 }
10647
10648 PERL_STATIC_INLINE U8
10649 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
10650 {
10651     U8 op;
10652
10653     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
10654
10655     if (! FOLD) {
10656         return EXACT;
10657     }
10658
10659     op = get_regex_charset(RExC_flags);
10660     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
10661         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
10662                  been, so there is no hole */
10663     }
10664
10665     return op + EXACTF;
10666 }
10667
10668 PERL_STATIC_INLINE void
10669 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
10670 {
10671     /* This knows the details about sizing an EXACTish node, setting flags for
10672      * it (by setting <*flagp>, and potentially populating it with a single
10673      * character.
10674      *
10675      * If <len> (the length in bytes) is non-zero, this function assumes that
10676      * the node has already been populated, and just does the sizing.  In this
10677      * case <code_point> should be the final code point that has already been
10678      * placed into the node.  This value will be ignored except that under some
10679      * circumstances <*flagp> is set based on it.
10680      *
10681      * If <len> is zero, the function assumes that the node is to contain only
10682      * the single character given by <code_point> and calculates what <len>
10683      * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
10684      * additionally will populate the node's STRING with <code_point>, if <len>
10685      * is 0.  In both cases <*flagp> is appropriately set
10686      *
10687      * It knows that under FOLD, the Latin Sharp S and UTF characters above
10688      * 255, must be folded (the former only when the rules indicate it can
10689      * match 'ss') */
10690
10691     bool len_passed_in = cBOOL(len != 0);
10692     U8 character[UTF8_MAXBYTES_CASE+1];
10693
10694     PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
10695
10696     if (! len_passed_in) {
10697         if (UTF) {
10698             if (FOLD && (! LOC || code_point > 255)) {
10699                 _to_uni_fold_flags(code_point,
10700                                    character,
10701                                    &len,
10702                                    FOLD_FLAGS_FULL | ((LOC)
10703                                                      ? FOLD_FLAGS_LOCALE
10704                                                      : (ASCII_FOLD_RESTRICTED)
10705                                                        ? FOLD_FLAGS_NOMIX_ASCII
10706                                                        : 0));
10707             }
10708             else {
10709                 uvchr_to_utf8( character, code_point);
10710                 len = UTF8SKIP(character);
10711             }
10712         }
10713         else if (! FOLD
10714                  || code_point != LATIN_SMALL_LETTER_SHARP_S
10715                  || ASCII_FOLD_RESTRICTED
10716                  || ! AT_LEAST_UNI_SEMANTICS)
10717         {
10718             *character = (U8) code_point;
10719             len = 1;
10720         }
10721         else {
10722             *character = 's';
10723             *(character + 1) = 's';
10724             len = 2;
10725         }
10726     }
10727
10728     if (SIZE_ONLY) {
10729         RExC_size += STR_SZ(len);
10730     }
10731     else {
10732         RExC_emit += STR_SZ(len);
10733         STR_LEN(node) = len;
10734         if (! len_passed_in) {
10735             Copy((char *) character, STRING(node), len, char);
10736         }
10737     }
10738
10739     *flagp |= HASWIDTH;
10740
10741     /* A single character node is SIMPLE, except for the special-cased SHARP S
10742      * under /di. */
10743     if ((len == 1 || (UTF && len == UNISKIP(code_point)))
10744         && (code_point != LATIN_SMALL_LETTER_SHARP_S
10745             || ! FOLD || ! DEPENDS_SEMANTICS))
10746     {
10747         *flagp |= SIMPLE;
10748     }
10749 }
10750
10751
10752 /* return atoi(p), unless it's too big to sensibly be a backref,
10753  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
10754
10755 static I32
10756 S_backref_value(char *p)
10757 {
10758     char *q = p;
10759
10760     for (;isDIGIT(*q); q++); /* calculate length of num */
10761     if (q - p == 0 || q - p > 9)
10762         return I32_MAX;
10763     return atoi(p);
10764 }
10765
10766
10767 /*
10768  - regatom - the lowest level
10769
10770    Try to identify anything special at the start of the pattern. If there
10771    is, then handle it as required. This may involve generating a single regop,
10772    such as for an assertion; or it may involve recursing, such as to
10773    handle a () structure.
10774
10775    If the string doesn't start with something special then we gobble up
10776    as much literal text as we can.
10777
10778    Once we have been able to handle whatever type of thing started the
10779    sequence, we return.
10780
10781    Note: we have to be careful with escapes, as they can be both literal
10782    and special, and in the case of \10 and friends, context determines which.
10783
10784    A summary of the code structure is:
10785
10786    switch (first_byte) {
10787         cases for each special:
10788             handle this special;
10789             break;
10790         case '\\':
10791             switch (2nd byte) {
10792                 cases for each unambiguous special:
10793                     handle this special;
10794                     break;
10795                 cases for each ambigous special/literal:
10796                     disambiguate;
10797                     if (special)  handle here
10798                     else goto defchar;
10799                 default: // unambiguously literal:
10800                     goto defchar;
10801             }
10802         default:  // is a literal char
10803             // FALL THROUGH
10804         defchar:
10805             create EXACTish node for literal;
10806             while (more input and node isn't full) {
10807                 switch (input_byte) {
10808                    cases for each special;
10809                        make sure parse pointer is set so that the next call to
10810                            regatom will see this special first
10811                        goto loopdone; // EXACTish node terminated by prev. char
10812                    default:
10813                        append char to EXACTISH node;
10814                 }
10815                 get next input byte;
10816             }
10817         loopdone:
10818    }
10819    return the generated node;
10820
10821    Specifically there are two separate switches for handling
10822    escape sequences, with the one for handling literal escapes requiring
10823    a dummy entry for all of the special escapes that are actually handled
10824    by the other.
10825
10826    Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
10827    TRYAGAIN.
10828    Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10829    restarted.
10830    Otherwise does not return NULL.
10831 */
10832
10833 STATIC regnode *
10834 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10835 {
10836     dVAR;
10837     regnode *ret = NULL;
10838     I32 flags = 0;
10839     char *parse_start = RExC_parse;
10840     U8 op;
10841     int invert = 0;
10842
10843     GET_RE_DEBUG_FLAGS_DECL;
10844
10845     *flagp = WORST;             /* Tentatively. */
10846
10847     DEBUG_PARSE("atom");
10848
10849     PERL_ARGS_ASSERT_REGATOM;
10850
10851 tryagain:
10852     switch ((U8)*RExC_parse) {
10853     case '^':
10854         RExC_seen_zerolen++;
10855         nextchar(pRExC_state);
10856         if (RExC_flags & RXf_PMf_MULTILINE)
10857             ret = reg_node(pRExC_state, MBOL);
10858         else if (RExC_flags & RXf_PMf_SINGLELINE)
10859             ret = reg_node(pRExC_state, SBOL);
10860         else
10861             ret = reg_node(pRExC_state, BOL);
10862         Set_Node_Length(ret, 1); /* MJD */
10863         break;
10864     case '$':
10865         nextchar(pRExC_state);
10866         if (*RExC_parse)
10867             RExC_seen_zerolen++;
10868         if (RExC_flags & RXf_PMf_MULTILINE)
10869             ret = reg_node(pRExC_state, MEOL);
10870         else if (RExC_flags & RXf_PMf_SINGLELINE)
10871             ret = reg_node(pRExC_state, SEOL);
10872         else
10873             ret = reg_node(pRExC_state, EOL);
10874         Set_Node_Length(ret, 1); /* MJD */
10875         break;
10876     case '.':
10877         nextchar(pRExC_state);
10878         if (RExC_flags & RXf_PMf_SINGLELINE)
10879             ret = reg_node(pRExC_state, SANY);
10880         else
10881             ret = reg_node(pRExC_state, REG_ANY);
10882         *flagp |= HASWIDTH|SIMPLE;
10883         RExC_naughty++;
10884         Set_Node_Length(ret, 1); /* MJD */
10885         break;
10886     case '[':
10887     {
10888         char * const oregcomp_parse = ++RExC_parse;
10889         ret = regclass(pRExC_state, flagp,depth+1,
10890                        FALSE, /* means parse the whole char class */
10891                        TRUE, /* allow multi-char folds */
10892                        FALSE, /* don't silence non-portable warnings. */
10893                        NULL);
10894         if (*RExC_parse != ']') {
10895             RExC_parse = oregcomp_parse;
10896             vFAIL("Unmatched [");
10897         }
10898         if (ret == NULL) {
10899             if (*flagp & RESTART_UTF8)
10900                 return NULL;
10901             FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
10902                   (UV) *flagp);
10903         }
10904         nextchar(pRExC_state);
10905         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10906         break;
10907     }
10908     case '(':
10909         nextchar(pRExC_state);
10910         ret = reg(pRExC_state, 2, &flags,depth+1);
10911         if (ret == NULL) {
10912                 if (flags & TRYAGAIN) {
10913                     if (RExC_parse == RExC_end) {
10914                          /* Make parent create an empty node if needed. */
10915                         *flagp |= TRYAGAIN;
10916                         return(NULL);
10917                     }
10918                     goto tryagain;
10919                 }
10920                 if (flags & RESTART_UTF8) {
10921                     *flagp = RESTART_UTF8;
10922                     return NULL;
10923                 }
10924                 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", (UV) flags);
10925         }
10926         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10927         break;
10928     case '|':
10929     case ')':
10930         if (flags & TRYAGAIN) {
10931             *flagp |= TRYAGAIN;
10932             return NULL;
10933         }
10934         vFAIL("Internal urp");
10935                                 /* Supposed to be caught earlier. */
10936         break;
10937     case '{':
10938         if (!regcurly(RExC_parse, FALSE)) {
10939             RExC_parse++;
10940             goto defchar;
10941         }
10942         /* FALL THROUGH */
10943     case '?':
10944     case '+':
10945     case '*':
10946         RExC_parse++;
10947         vFAIL("Quantifier follows nothing");
10948         break;
10949     case '\\':
10950         /* Special Escapes
10951
10952            This switch handles escape sequences that resolve to some kind
10953            of special regop and not to literal text. Escape sequnces that
10954            resolve to literal text are handled below in the switch marked
10955            "Literal Escapes".
10956
10957            Every entry in this switch *must* have a corresponding entry
10958            in the literal escape switch. However, the opposite is not
10959            required, as the default for this switch is to jump to the
10960            literal text handling code.
10961         */
10962         switch ((U8)*++RExC_parse) {
10963             U8 arg;
10964         /* Special Escapes */
10965         case 'A':
10966             RExC_seen_zerolen++;
10967             ret = reg_node(pRExC_state, SBOL);
10968             *flagp |= SIMPLE;
10969             goto finish_meta_pat;
10970         case 'G':
10971             ret = reg_node(pRExC_state, GPOS);
10972             RExC_seen |= REG_SEEN_GPOS;
10973             *flagp |= SIMPLE;
10974             goto finish_meta_pat;
10975         case 'K':
10976             RExC_seen_zerolen++;
10977             ret = reg_node(pRExC_state, KEEPS);
10978             *flagp |= SIMPLE;
10979             /* XXX:dmq : disabling in-place substitution seems to
10980              * be necessary here to avoid cases of memory corruption, as
10981              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10982              */
10983             RExC_seen |= REG_SEEN_LOOKBEHIND;
10984             goto finish_meta_pat;
10985         case 'Z':
10986             ret = reg_node(pRExC_state, SEOL);
10987             *flagp |= SIMPLE;
10988             RExC_seen_zerolen++;                /* Do not optimize RE away */
10989             goto finish_meta_pat;
10990         case 'z':
10991             ret = reg_node(pRExC_state, EOS);
10992             *flagp |= SIMPLE;
10993             RExC_seen_zerolen++;                /* Do not optimize RE away */
10994             goto finish_meta_pat;
10995         case 'C':
10996             ret = reg_node(pRExC_state, CANY);
10997             RExC_seen |= REG_SEEN_CANY;
10998             *flagp |= HASWIDTH|SIMPLE;
10999             goto finish_meta_pat;
11000         case 'X':
11001             ret = reg_node(pRExC_state, CLUMP);
11002             *flagp |= HASWIDTH;
11003             goto finish_meta_pat;
11004
11005         case 'W':
11006             invert = 1;
11007             /* FALLTHROUGH */
11008         case 'w':
11009             arg = ANYOF_WORDCHAR;
11010             goto join_posix;
11011
11012         case 'b':
11013             RExC_seen_zerolen++;
11014             RExC_seen |= REG_SEEN_LOOKBEHIND;
11015             op = BOUND + get_regex_charset(RExC_flags);
11016             if (op > BOUNDA) {  /* /aa is same as /a */
11017                 op = BOUNDA;
11018             }
11019             ret = reg_node(pRExC_state, op);
11020             FLAGS(ret) = get_regex_charset(RExC_flags);
11021             *flagp |= SIMPLE;
11022             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11023                 ckWARNdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" or \"\\b[{]\" instead");
11024             }
11025             goto finish_meta_pat;
11026         case 'B':
11027             RExC_seen_zerolen++;
11028             RExC_seen |= REG_SEEN_LOOKBEHIND;
11029             op = NBOUND + get_regex_charset(RExC_flags);
11030             if (op > NBOUNDA) { /* /aa is same as /a */
11031                 op = NBOUNDA;
11032             }
11033             ret = reg_node(pRExC_state, op);
11034             FLAGS(ret) = get_regex_charset(RExC_flags);
11035             *flagp |= SIMPLE;
11036             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11037                 ckWARNdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" or \"\\B[{]\" instead");
11038             }
11039             goto finish_meta_pat;
11040
11041         case 'D':
11042             invert = 1;
11043             /* FALLTHROUGH */
11044         case 'd':
11045             arg = ANYOF_DIGIT;
11046             goto join_posix;
11047
11048         case 'R':
11049             ret = reg_node(pRExC_state, LNBREAK);
11050             *flagp |= HASWIDTH|SIMPLE;
11051             goto finish_meta_pat;
11052
11053         case 'H':
11054             invert = 1;
11055             /* FALLTHROUGH */
11056         case 'h':
11057             arg = ANYOF_BLANK;
11058             op = POSIXU;
11059             goto join_posix_op_known;
11060
11061         case 'V':
11062             invert = 1;
11063             /* FALLTHROUGH */
11064         case 'v':
11065             arg = ANYOF_VERTWS;
11066             op = POSIXU;
11067             goto join_posix_op_known;
11068
11069         case 'S':
11070             invert = 1;
11071             /* FALLTHROUGH */
11072         case 's':
11073             arg = ANYOF_SPACE;
11074
11075         join_posix:
11076
11077             op = POSIXD + get_regex_charset(RExC_flags);
11078             if (op > POSIXA) {  /* /aa is same as /a */
11079                 op = POSIXA;
11080             }
11081
11082         join_posix_op_known:
11083
11084             if (invert) {
11085                 op += NPOSIXD - POSIXD;
11086             }
11087
11088             ret = reg_node(pRExC_state, op);
11089             if (! SIZE_ONLY) {
11090                 FLAGS(ret) = namedclass_to_classnum(arg);
11091             }
11092
11093             *flagp |= HASWIDTH|SIMPLE;
11094             /* FALL THROUGH */
11095
11096          finish_meta_pat:
11097             nextchar(pRExC_state);
11098             Set_Node_Length(ret, 2); /* MJD */
11099             break;
11100         case 'p':
11101         case 'P':
11102             {
11103 #ifdef DEBUGGING
11104                 char* parse_start = RExC_parse - 2;
11105 #endif
11106
11107                 RExC_parse--;
11108
11109                 ret = regclass(pRExC_state, flagp,depth+1,
11110                                TRUE, /* means just parse this element */
11111                                FALSE, /* don't allow multi-char folds */
11112                                FALSE, /* don't silence non-portable warnings.
11113                                          It would be a bug if these returned
11114                                          non-portables */
11115                                NULL);
11116                 /* regclass() can only return RESTART_UTF8 if multi-char folds
11117                    are allowed.  */
11118                 if (!ret)
11119                     FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11120                           (UV) *flagp);
11121
11122                 RExC_parse--;
11123
11124                 Set_Node_Offset(ret, parse_start + 2);
11125                 Set_Node_Cur_Length(ret, parse_start);
11126                 nextchar(pRExC_state);
11127             }
11128             break;
11129         case 'N':
11130             /* Handle \N and \N{NAME} with multiple code points here and not
11131              * below because it can be multicharacter. join_exact() will join
11132              * them up later on.  Also this makes sure that things like
11133              * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
11134              * The options to the grok function call causes it to fail if the
11135              * sequence is just a single code point.  We then go treat it as
11136              * just another character in the current EXACT node, and hence it
11137              * gets uniform treatment with all the other characters.  The
11138              * special treatment for quantifiers is not needed for such single
11139              * character sequences */
11140             ++RExC_parse;
11141             if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
11142                                 FALSE /* not strict */ )) {
11143                 if (*flagp & RESTART_UTF8)
11144                     return NULL;
11145                 RExC_parse--;
11146                 goto defchar;
11147             }
11148             break;
11149         case 'k':    /* Handle \k<NAME> and \k'NAME' */
11150         parse_named_seq:
11151         {
11152             char ch= RExC_parse[1];
11153             if (ch != '<' && ch != '\'' && ch != '{') {
11154                 RExC_parse++;
11155                 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11156                 vFAIL2("Sequence %.2s... not terminated",parse_start);
11157             } else {
11158                 /* this pretty much dupes the code for (?P=...) in reg(), if
11159                    you change this make sure you change that */
11160                 char* name_start = (RExC_parse += 2);
11161                 U32 num = 0;
11162                 SV *sv_dat = reg_scan_name(pRExC_state,
11163                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11164                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
11165                 if (RExC_parse == name_start || *RExC_parse != ch)
11166                     /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11167                     vFAIL2("Sequence %.3s... not terminated",parse_start);
11168
11169                 if (!SIZE_ONLY) {
11170                     num = add_data( pRExC_state, STR_WITH_LEN("S"));
11171                     RExC_rxi->data->data[num]=(void*)sv_dat;
11172                     SvREFCNT_inc_simple_void(sv_dat);
11173                 }
11174
11175                 RExC_sawback = 1;
11176                 ret = reganode(pRExC_state,
11177                                ((! FOLD)
11178                                  ? NREF
11179                                  : (ASCII_FOLD_RESTRICTED)
11180                                    ? NREFFA
11181                                    : (AT_LEAST_UNI_SEMANTICS)
11182                                      ? NREFFU
11183                                      : (LOC)
11184                                        ? NREFFL
11185                                        : NREFF),
11186                                 num);
11187                 *flagp |= HASWIDTH;
11188
11189                 /* override incorrect value set in reganode MJD */
11190                 Set_Node_Offset(ret, parse_start+1);
11191                 Set_Node_Cur_Length(ret, parse_start);
11192                 nextchar(pRExC_state);
11193
11194             }
11195             break;
11196         }
11197         case 'g':
11198         case '1': case '2': case '3': case '4':
11199         case '5': case '6': case '7': case '8': case '9':
11200             {
11201                 I32 num;
11202                 bool hasbrace = 0;
11203
11204                 if (*RExC_parse == 'g') {
11205                     bool isrel = 0;
11206
11207                     RExC_parse++;
11208                     if (*RExC_parse == '{') {
11209                         RExC_parse++;
11210                         hasbrace = 1;
11211                     }
11212                     if (*RExC_parse == '-') {
11213                         RExC_parse++;
11214                         isrel = 1;
11215                     }
11216                     if (hasbrace && !isDIGIT(*RExC_parse)) {
11217                         if (isrel) RExC_parse--;
11218                         RExC_parse -= 2;
11219                         goto parse_named_seq;
11220                     }
11221
11222                     num = S_backref_value(RExC_parse);
11223                     if (num == 0)
11224                         vFAIL("Reference to invalid group 0");
11225                     else if (num == I32_MAX) {
11226                          if (isDIGIT(*RExC_parse))
11227                             vFAIL("Reference to nonexistent group");
11228                         else
11229                             vFAIL("Unterminated \\g... pattern");
11230                     }
11231
11232                     if (isrel) {
11233                         num = RExC_npar - num;
11234                         if (num < 1)
11235                             vFAIL("Reference to nonexistent or unclosed group");
11236                     }
11237                 }
11238                 else {
11239                     num = S_backref_value(RExC_parse);
11240                     /* bare \NNN might be backref or octal */
11241                     if (num == I32_MAX || (num > 9 && num >= RExC_npar
11242                             && *RExC_parse != '8' && *RExC_parse != '9'))
11243                         /* Probably a character specified in octal, e.g. \35 */
11244                         goto defchar;
11245                 }
11246
11247                 /* at this point RExC_parse definitely points to a backref
11248                  * number */
11249                 {
11250 #ifdef RE_TRACK_PATTERN_OFFSETS
11251                     char * const parse_start = RExC_parse - 1; /* MJD */
11252 #endif
11253                     while (isDIGIT(*RExC_parse))
11254                         RExC_parse++;
11255                     if (hasbrace) {
11256                         if (*RExC_parse != '}')
11257                             vFAIL("Unterminated \\g{...} pattern");
11258                         RExC_parse++;
11259                     }
11260                     if (!SIZE_ONLY) {
11261                         if (num > (I32)RExC_rx->nparens)
11262                             vFAIL("Reference to nonexistent group");
11263                     }
11264                     RExC_sawback = 1;
11265                     ret = reganode(pRExC_state,
11266                                    ((! FOLD)
11267                                      ? REF
11268                                      : (ASCII_FOLD_RESTRICTED)
11269                                        ? REFFA
11270                                        : (AT_LEAST_UNI_SEMANTICS)
11271                                          ? REFFU
11272                                          : (LOC)
11273                                            ? REFFL
11274                                            : REFF),
11275                                     num);
11276                     *flagp |= HASWIDTH;
11277
11278                     /* override incorrect value set in reganode MJD */
11279                     Set_Node_Offset(ret, parse_start+1);
11280                     Set_Node_Cur_Length(ret, parse_start);
11281                     RExC_parse--;
11282                     nextchar(pRExC_state);
11283                 }
11284             }
11285             break;
11286         case '\0':
11287             if (RExC_parse >= RExC_end)
11288                 FAIL("Trailing \\");
11289             /* FALL THROUGH */
11290         default:
11291             /* Do not generate "unrecognized" warnings here, we fall
11292                back into the quick-grab loop below */
11293             parse_start--;
11294             goto defchar;
11295         }
11296         break;
11297
11298     case '#':
11299         if (RExC_flags & RXf_PMf_EXTENDED) {
11300             if ( reg_skipcomment( pRExC_state ) )
11301                 goto tryagain;
11302         }
11303         /* FALL THROUGH */
11304
11305     default:
11306
11307             parse_start = RExC_parse - 1;
11308
11309             RExC_parse++;
11310
11311         defchar: {
11312             STRLEN len = 0;
11313             UV ender = 0;
11314             char *p;
11315             char *s;
11316 #define MAX_NODE_STRING_SIZE 127
11317             char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
11318             char *s0;
11319             U8 upper_parse = MAX_NODE_STRING_SIZE;
11320             STRLEN foldlen;
11321             U8 node_type = compute_EXACTish(pRExC_state);
11322             bool next_is_quantifier;
11323             char * oldp = NULL;
11324
11325             /* We can convert EXACTF nodes to EXACTFU if they contain only
11326              * characters that match identically regardless of the target
11327              * string's UTF8ness.  The reason to do this is that EXACTF is not
11328              * trie-able, EXACTFU is.  (We don't need to figure this out until
11329              * pass 2) */
11330             bool maybe_exactfu = node_type == EXACTF && PASS2;
11331
11332             /* If a folding node contains only code points that don't
11333              * participate in folds, it can be changed into an EXACT node,
11334              * which allows the optimizer more things to look for */
11335             bool maybe_exact;
11336
11337             ret = reg_node(pRExC_state, node_type);
11338
11339             /* In pass1, folded, we use a temporary buffer instead of the
11340              * actual node, as the node doesn't exist yet */
11341             s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
11342
11343             s0 = s;
11344
11345         reparse:
11346
11347             /* We do the EXACTFish to EXACT node only if folding, and not if in
11348              * locale, as whether a character folds or not isn't known until
11349              * runtime.  (And we don't need to figure this out until pass 2) */
11350             maybe_exact = FOLD && ! LOC && PASS2;
11351
11352             /* XXX The node can hold up to 255 bytes, yet this only goes to
11353              * 127.  I (khw) do not know why.  Keeping it somewhat less than
11354              * 255 allows us to not have to worry about overflow due to
11355              * converting to utf8 and fold expansion, but that value is
11356              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
11357              * split up by this limit into a single one using the real max of
11358              * 255.  Even at 127, this breaks under rare circumstances.  If
11359              * folding, we do not want to split a node at a character that is a
11360              * non-final in a multi-char fold, as an input string could just
11361              * happen to want to match across the node boundary.  The join
11362              * would solve that problem if the join actually happens.  But a
11363              * series of more than two nodes in a row each of 127 would cause
11364              * the first join to succeed to get to 254, but then there wouldn't
11365              * be room for the next one, which could at be one of those split
11366              * multi-char folds.  I don't know of any fool-proof solution.  One
11367              * could back off to end with only a code point that isn't such a
11368              * non-final, but it is possible for there not to be any in the
11369              * entire node. */
11370             for (p = RExC_parse - 1;
11371                  len < upper_parse && p < RExC_end;
11372                  len++)
11373             {
11374                 oldp = p;
11375
11376                 if (RExC_flags & RXf_PMf_EXTENDED)
11377                     p = regwhite( pRExC_state, p );
11378                 switch ((U8)*p) {
11379                 case '^':
11380                 case '$':
11381                 case '.':
11382                 case '[':
11383                 case '(':
11384                 case ')':
11385                 case '|':
11386                     goto loopdone;
11387                 case '\\':
11388                     /* Literal Escapes Switch
11389
11390                        This switch is meant to handle escape sequences that
11391                        resolve to a literal character.
11392
11393                        Every escape sequence that represents something
11394                        else, like an assertion or a char class, is handled
11395                        in the switch marked 'Special Escapes' above in this
11396                        routine, but also has an entry here as anything that
11397                        isn't explicitly mentioned here will be treated as
11398                        an unescaped equivalent literal.
11399                     */
11400
11401                     switch ((U8)*++p) {
11402                     /* These are all the special escapes. */
11403                     case 'A':             /* Start assertion */
11404                     case 'b': case 'B':   /* Word-boundary assertion*/
11405                     case 'C':             /* Single char !DANGEROUS! */
11406                     case 'd': case 'D':   /* digit class */
11407                     case 'g': case 'G':   /* generic-backref, pos assertion */
11408                     case 'h': case 'H':   /* HORIZWS */
11409                     case 'k': case 'K':   /* named backref, keep marker */
11410                     case 'p': case 'P':   /* Unicode property */
11411                               case 'R':   /* LNBREAK */
11412                     case 's': case 'S':   /* space class */
11413                     case 'v': case 'V':   /* VERTWS */
11414                     case 'w': case 'W':   /* word class */
11415                     case 'X':             /* eXtended Unicode "combining character sequence" */
11416                     case 'z': case 'Z':   /* End of line/string assertion */
11417                         --p;
11418                         goto loopdone;
11419
11420                     /* Anything after here is an escape that resolves to a
11421                        literal. (Except digits, which may or may not)
11422                      */
11423                     case 'n':
11424                         ender = '\n';
11425                         p++;
11426                         break;
11427                     case 'N': /* Handle a single-code point named character. */
11428                         /* The options cause it to fail if a multiple code
11429                          * point sequence.  Handle those in the switch() above
11430                          * */
11431                         RExC_parse = p + 1;
11432                         if (! grok_bslash_N(pRExC_state, NULL, &ender,
11433                                             flagp, depth, FALSE,
11434                                             FALSE /* not strict */ ))
11435                         {
11436                             if (*flagp & RESTART_UTF8)
11437                                 FAIL("panic: grok_bslash_N set RESTART_UTF8");
11438                             RExC_parse = p = oldp;
11439                             goto loopdone;
11440                         }
11441                         p = RExC_parse;
11442                         if (ender > 0xff) {
11443                             REQUIRE_UTF8;
11444                         }
11445                         break;
11446                     case 'r':
11447                         ender = '\r';
11448                         p++;
11449                         break;
11450                     case 't':
11451                         ender = '\t';
11452                         p++;
11453                         break;
11454                     case 'f':
11455                         ender = '\f';
11456                         p++;
11457                         break;
11458                     case 'e':
11459                           ender = ASCII_TO_NATIVE('\033');
11460                         p++;
11461                         break;
11462                     case 'a':
11463                           ender = '\a';
11464                         p++;
11465                         break;
11466                     case 'o':
11467                         {
11468                             UV result;
11469                             const char* error_msg;
11470
11471                             bool valid = grok_bslash_o(&p,
11472                                                        &result,
11473                                                        &error_msg,
11474                                                        TRUE, /* out warnings */
11475                                                        FALSE, /* not strict */
11476                                                        TRUE, /* Output warnings
11477                                                                 for non-
11478                                                                 portables */
11479                                                        UTF);
11480                             if (! valid) {
11481                                 RExC_parse = p; /* going to die anyway; point
11482                                                    to exact spot of failure */
11483                                 vFAIL(error_msg);
11484                             }
11485                             ender = result;
11486                             if (PL_encoding && ender < 0x100) {
11487                                 goto recode_encoding;
11488                             }
11489                             if (ender > 0xff) {
11490                                 REQUIRE_UTF8;
11491                             }
11492                             break;
11493                         }
11494                     case 'x':
11495                         {
11496                             UV result = UV_MAX; /* initialize to erroneous
11497                                                    value */
11498                             const char* error_msg;
11499
11500                             bool valid = grok_bslash_x(&p,
11501                                                        &result,
11502                                                        &error_msg,
11503                                                        TRUE, /* out warnings */
11504                                                        FALSE, /* not strict */
11505                                                        TRUE, /* Output warnings
11506                                                                 for non-
11507                                                                 portables */
11508                                                        UTF);
11509                             if (! valid) {
11510                                 RExC_parse = p; /* going to die anyway; point
11511                                                    to exact spot of failure */
11512                                 vFAIL(error_msg);
11513                             }
11514                             ender = result;
11515
11516                             if (PL_encoding && ender < 0x100) {
11517                                 goto recode_encoding;
11518                             }
11519                             if (ender > 0xff) {
11520                                 REQUIRE_UTF8;
11521                             }
11522                             break;
11523                         }
11524                     case 'c':
11525                         p++;
11526                         ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
11527                         break;
11528                     case '8': case '9': /* must be a backreference */
11529                         --p;
11530                         goto loopdone;
11531                     case '1': case '2': case '3':case '4':
11532                     case '5': case '6': case '7':
11533                         /* When we parse backslash escapes there is ambiguity
11534                          * between backreferences and octal escapes. Any escape
11535                          * from \1 - \9 is a backreference, any multi-digit
11536                          * escape which does not start with 0 and which when
11537                          * evaluated as decimal could refer to an already
11538                          * parsed capture buffer is a backslash. Anything else
11539                          * is octal.
11540                          *
11541                          * Note this implies that \118 could be interpreted as
11542                          * 118 OR as "\11" . "8" depending on whether there
11543                          * were 118 capture buffers defined already in the
11544                          * pattern.  */
11545                         if ( !isDIGIT(p[1]) || S_backref_value(p) <= RExC_npar)
11546                         {  /* Not to be treated as an octal constant, go
11547                                    find backref */
11548                             --p;
11549                             goto loopdone;
11550                         }
11551                     case '0':
11552                         {
11553                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11554                             STRLEN numlen = 3;
11555                             ender = grok_oct(p, &numlen, &flags, NULL);
11556                             if (ender > 0xff) {
11557                                 REQUIRE_UTF8;
11558                             }
11559                             p += numlen;
11560                             if (SIZE_ONLY   /* like \08, \178 */
11561                                 && numlen < 3
11562                                 && p < RExC_end
11563                                 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
11564                             {
11565                                 reg_warn_non_literal_string(
11566                                          p + 1,
11567                                          form_short_octal_warning(p, numlen));
11568                             }
11569                         }
11570                         if (PL_encoding && ender < 0x100)
11571                             goto recode_encoding;
11572                         break;
11573                     recode_encoding:
11574                         if (! RExC_override_recoding) {
11575                             SV* enc = PL_encoding;
11576                             ender = reg_recode((const char)(U8)ender, &enc);
11577                             if (!enc && SIZE_ONLY)
11578                                 ckWARNreg(p, "Invalid escape in the specified encoding");
11579                             REQUIRE_UTF8;
11580                         }
11581                         break;
11582                     case '\0':
11583                         if (p >= RExC_end)
11584                             FAIL("Trailing \\");
11585                         /* FALL THROUGH */
11586                     default:
11587                         if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
11588                             /* Include any { following the alpha to emphasize
11589                              * that it could be part of an escape at some point
11590                              * in the future */
11591                             int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
11592                             ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
11593                         }
11594                         goto normal_default;
11595                     } /* End of switch on '\' */
11596                     break;
11597                 default:    /* A literal character */
11598
11599                     if (! SIZE_ONLY
11600                         && RExC_flags & RXf_PMf_EXTENDED
11601                         && ckWARN_d(WARN_DEPRECATED)
11602                         && is_PATWS_non_low(p, UTF))
11603                     {
11604                         vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
11605                                 "Escape literal pattern white space under /x");
11606                     }
11607
11608                   normal_default:
11609                     if (UTF8_IS_START(*p) && UTF) {
11610                         STRLEN numlen;
11611                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
11612                                                &numlen, UTF8_ALLOW_DEFAULT);
11613                         p += numlen;
11614                     }
11615                     else
11616                         ender = (U8) *p++;
11617                     break;
11618                 } /* End of switch on the literal */
11619
11620                 /* Here, have looked at the literal character and <ender>
11621                  * contains its ordinal, <p> points to the character after it
11622                  */
11623
11624                 if ( RExC_flags & RXf_PMf_EXTENDED)
11625                     p = regwhite( pRExC_state, p );
11626
11627                 /* If the next thing is a quantifier, it applies to this
11628                  * character only, which means that this character has to be in
11629                  * its own node and can't just be appended to the string in an
11630                  * existing node, so if there are already other characters in
11631                  * the node, close the node with just them, and set up to do
11632                  * this character again next time through, when it will be the
11633                  * only thing in its new node */
11634                 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
11635                 {
11636                     p = oldp;
11637                     goto loopdone;
11638                 }
11639
11640                 if (! FOLD) {
11641                     if (UTF) {
11642                         const STRLEN unilen = reguni(pRExC_state, ender, s);
11643                         if (unilen > 0) {
11644                            s   += unilen;
11645                            len += unilen;
11646                         }
11647
11648                         /* The loop increments <len> each time, as all but this
11649                          * path (and one other) through it add a single byte to
11650                          * the EXACTish node.  But this one has changed len to
11651                          * be the correct final value, so subtract one to
11652                          * cancel out the increment that follows */
11653                         len--;
11654                     }
11655                     else {
11656                         REGC((char)ender, s++);
11657                     }
11658                 }
11659                 else /* FOLD */ if (! ( UTF
11660                         /* See comments for join_exact() as to why we fold this
11661                          * non-UTF at compile time */
11662                         || (node_type == EXACTFU
11663                             && ender == LATIN_SMALL_LETTER_SHARP_S)))
11664                 {
11665                     if (IS_IN_SOME_FOLD_L1(ender)) {
11666                         maybe_exact = FALSE;
11667
11668                         /* See if the character's fold differs between /d and
11669                          * /u.  This includes the multi-char fold SHARP S to
11670                          * 'ss' */
11671                         if (maybe_exactfu
11672                             && (PL_fold[ender] != PL_fold_latin1[ender]
11673                                 || ender == LATIN_SMALL_LETTER_SHARP_S
11674                                 || (len > 0
11675                                    && isARG2_lower_or_UPPER_ARG1('s', ender)
11676                                    && isARG2_lower_or_UPPER_ARG1('s', *(s-1)))))
11677                         {
11678                             maybe_exactfu = FALSE;
11679                         }
11680                     }
11681                     *(s++) = (char) ender;
11682                 }
11683                 else {  /* UTF */
11684
11685                     /* Prime the casefolded buffer.  Locale rules, which apply
11686                      * only to code points < 256, aren't known until execution,
11687                      * so for them, just output the original character using
11688                      * utf8.  If we start to fold non-UTF patterns, be sure to
11689                      * update join_exact() */
11690                     if (LOC && ender < 256) {
11691                         if (UVCHR_IS_INVARIANT(ender)) {
11692                             *s = (U8) ender;
11693                             foldlen = 1;
11694                         } else {
11695                             *s = UTF8_TWO_BYTE_HI(ender);
11696                             *(s + 1) = UTF8_TWO_BYTE_LO(ender);
11697                             foldlen = 2;
11698                         }
11699                     }
11700                     else {
11701                         UV folded = _to_uni_fold_flags(
11702                                        ender,
11703                                        (U8 *) s,
11704                                        &foldlen,
11705                                        FOLD_FLAGS_FULL
11706                                        | ((LOC) ?  FOLD_FLAGS_LOCALE
11707                                                 : (ASCII_FOLD_RESTRICTED)
11708                                                   ? FOLD_FLAGS_NOMIX_ASCII
11709                                                   : 0)
11710                                         );
11711
11712                         /* If this node only contains non-folding code points
11713                          * so far, see if this new one is also non-folding */
11714                         if (maybe_exact) {
11715                             if (folded != ender) {
11716                                 maybe_exact = FALSE;
11717                             }
11718                             else {
11719                                 /* Here the fold is the original; we have
11720                                  * to check further to see if anything
11721                                  * folds to it */
11722                                 if (! PL_utf8_foldable) {
11723                                     SV* swash = swash_init("utf8",
11724                                                        "_Perl_Any_Folds",
11725                                                        &PL_sv_undef, 1, 0);
11726                                     PL_utf8_foldable =
11727                                                 _get_swash_invlist(swash);
11728                                     SvREFCNT_dec_NN(swash);
11729                                 }
11730                                 if (_invlist_contains_cp(PL_utf8_foldable,
11731                                                          ender))
11732                                 {
11733                                     maybe_exact = FALSE;
11734                                 }
11735                             }
11736                         }
11737                         ender = folded;
11738                     }
11739                     s += foldlen;
11740
11741                     /* The loop increments <len> each time, as all but this
11742                      * path (and one other) through it add a single byte to the
11743                      * EXACTish node.  But this one has changed len to be the
11744                      * correct final value, so subtract one to cancel out the
11745                      * increment that follows */
11746                     len += foldlen - 1;
11747                 }
11748
11749                 if (next_is_quantifier) {
11750
11751                     /* Here, the next input is a quantifier, and to get here,
11752                      * the current character is the only one in the node.
11753                      * Also, here <len> doesn't include the final byte for this
11754                      * character */
11755                     len++;
11756                     goto loopdone;
11757                 }
11758
11759             } /* End of loop through literal characters */
11760
11761             /* Here we have either exhausted the input or ran out of room in
11762              * the node.  (If we encountered a character that can't be in the
11763              * node, transfer is made directly to <loopdone>, and so we
11764              * wouldn't have fallen off the end of the loop.)  In the latter
11765              * case, we artificially have to split the node into two, because
11766              * we just don't have enough space to hold everything.  This
11767              * creates a problem if the final character participates in a
11768              * multi-character fold in the non-final position, as a match that
11769              * should have occurred won't, due to the way nodes are matched,
11770              * and our artificial boundary.  So back off until we find a non-
11771              * problematic character -- one that isn't at the beginning or
11772              * middle of such a fold.  (Either it doesn't participate in any
11773              * folds, or appears only in the final position of all the folds it
11774              * does participate in.)  A better solution with far fewer false
11775              * positives, and that would fill the nodes more completely, would
11776              * be to actually have available all the multi-character folds to
11777              * test against, and to back-off only far enough to be sure that
11778              * this node isn't ending with a partial one.  <upper_parse> is set
11779              * further below (if we need to reparse the node) to include just
11780              * up through that final non-problematic character that this code
11781              * identifies, so when it is set to less than the full node, we can
11782              * skip the rest of this */
11783             if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
11784
11785                 const STRLEN full_len = len;
11786
11787                 assert(len >= MAX_NODE_STRING_SIZE);
11788
11789                 /* Here, <s> points to the final byte of the final character.
11790                  * Look backwards through the string until find a non-
11791                  * problematic character */
11792
11793                 if (! UTF) {
11794
11795                     /* These two have no multi-char folds to non-UTF characters
11796                      */
11797                     if (ASCII_FOLD_RESTRICTED || LOC) {
11798                         goto loopdone;
11799                     }
11800
11801                     while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
11802                     len = s - s0 + 1;
11803                 }
11804                 else {
11805                     if (!  PL_NonL1NonFinalFold) {
11806                         PL_NonL1NonFinalFold = _new_invlist_C_array(
11807                                         NonL1_Perl_Non_Final_Folds_invlist);
11808                     }
11809
11810                     /* Point to the first byte of the final character */
11811                     s = (char *) utf8_hop((U8 *) s, -1);
11812
11813                     while (s >= s0) {   /* Search backwards until find
11814                                            non-problematic char */
11815                         if (UTF8_IS_INVARIANT(*s)) {
11816
11817                             /* There are no ascii characters that participate
11818                              * in multi-char folds under /aa.  In EBCDIC, the
11819                              * non-ascii invariants are all control characters,
11820                              * so don't ever participate in any folds. */
11821                             if (ASCII_FOLD_RESTRICTED
11822                                 || ! IS_NON_FINAL_FOLD(*s))
11823                             {
11824                                 break;
11825                             }
11826                         }
11827                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
11828
11829                             /* No Latin1 characters participate in multi-char
11830                              * folds under /l */
11831                             if (LOC
11832                                 || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
11833                                                                   *s, *(s+1))))
11834                             {
11835                                 break;
11836                             }
11837                         }
11838                         else if (! _invlist_contains_cp(
11839                                         PL_NonL1NonFinalFold,
11840                                         valid_utf8_to_uvchr((U8 *) s, NULL)))
11841                         {
11842                             break;
11843                         }
11844
11845                         /* Here, the current character is problematic in that
11846                          * it does occur in the non-final position of some
11847                          * fold, so try the character before it, but have to
11848                          * special case the very first byte in the string, so
11849                          * we don't read outside the string */
11850                         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
11851                     } /* End of loop backwards through the string */
11852
11853                     /* If there were only problematic characters in the string,
11854                      * <s> will point to before s0, in which case the length
11855                      * should be 0, otherwise include the length of the
11856                      * non-problematic character just found */
11857                     len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
11858                 }
11859
11860                 /* Here, have found the final character, if any, that is
11861                  * non-problematic as far as ending the node without splitting
11862                  * it across a potential multi-char fold.  <len> contains the
11863                  * number of bytes in the node up-to and including that
11864                  * character, or is 0 if there is no such character, meaning
11865                  * the whole node contains only problematic characters.  In
11866                  * this case, give up and just take the node as-is.  We can't
11867                  * do any better */
11868                 if (len == 0) {
11869                     len = full_len;
11870
11871                     /* If the node ends in an 's' we make sure it stays EXACTF,
11872                      * as if it turns into an EXACTFU, it could later get
11873                      * joined with another 's' that would then wrongly match
11874                      * the sharp s */
11875                     if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
11876                     {
11877                         maybe_exactfu = FALSE;
11878                     }
11879                 } else {
11880
11881                     /* Here, the node does contain some characters that aren't
11882                      * problematic.  If one such is the final character in the
11883                      * node, we are done */
11884                     if (len == full_len) {
11885                         goto loopdone;
11886                     }
11887                     else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
11888
11889                         /* If the final character is problematic, but the
11890                          * penultimate is not, back-off that last character to
11891                          * later start a new node with it */
11892                         p = oldp;
11893                         goto loopdone;
11894                     }
11895
11896                     /* Here, the final non-problematic character is earlier
11897                      * in the input than the penultimate character.  What we do
11898                      * is reparse from the beginning, going up only as far as
11899                      * this final ok one, thus guaranteeing that the node ends
11900                      * in an acceptable character.  The reason we reparse is
11901                      * that we know how far in the character is, but we don't
11902                      * know how to correlate its position with the input parse.
11903                      * An alternate implementation would be to build that
11904                      * correlation as we go along during the original parse,
11905                      * but that would entail extra work for every node, whereas
11906                      * this code gets executed only when the string is too
11907                      * large for the node, and the final two characters are
11908                      * problematic, an infrequent occurrence.  Yet another
11909                      * possible strategy would be to save the tail of the
11910                      * string, and the next time regatom is called, initialize
11911                      * with that.  The problem with this is that unless you
11912                      * back off one more character, you won't be guaranteed
11913                      * regatom will get called again, unless regbranch,
11914                      * regpiece ... are also changed.  If you do back off that
11915                      * extra character, so that there is input guaranteed to
11916                      * force calling regatom, you can't handle the case where
11917                      * just the first character in the node is acceptable.  I
11918                      * (khw) decided to try this method which doesn't have that
11919                      * pitfall; if performance issues are found, we can do a
11920                      * combination of the current approach plus that one */
11921                     upper_parse = len;
11922                     len = 0;
11923                     s = s0;
11924                     goto reparse;
11925                 }
11926             }   /* End of verifying node ends with an appropriate char */
11927
11928         loopdone:   /* Jumped to when encounters something that shouldn't be in
11929                        the node */
11930
11931             /* I (khw) don't know if you can get here with zero length, but the
11932              * old code handled this situation by creating a zero-length EXACT
11933              * node.  Might as well be NOTHING instead */
11934             if (len == 0) {
11935                 OP(ret) = NOTHING;
11936             }
11937             else {
11938                 if (FOLD) {
11939                     /* If 'maybe_exact' is still set here, means there are no
11940                      * code points in the node that participate in folds;
11941                      * similarly for 'maybe_exactfu' and code points that match
11942                      * differently depending on UTF8ness of the target string
11943                      * */
11944                     if (maybe_exact) {
11945                         OP(ret) = EXACT;
11946                     }
11947                     else if (maybe_exactfu) {
11948                         OP(ret) = EXACTFU;
11949                     }
11950                 }
11951                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
11952             }
11953
11954             RExC_parse = p - 1;
11955             Set_Node_Cur_Length(ret, parse_start);
11956             nextchar(pRExC_state);
11957             {
11958                 /* len is STRLEN which is unsigned, need to copy to signed */
11959                 IV iv = len;
11960                 if (iv < 0)
11961                     vFAIL("Internal disaster");
11962             }
11963
11964         } /* End of label 'defchar:' */
11965         break;
11966     } /* End of giant switch on input character */
11967
11968     return(ret);
11969 }
11970
11971 STATIC char *
11972 S_regwhite( RExC_state_t *pRExC_state, char *p )
11973 {
11974     const char *e = RExC_end;
11975
11976     PERL_ARGS_ASSERT_REGWHITE;
11977
11978     while (p < e) {
11979         if (isSPACE(*p))
11980             ++p;
11981         else if (*p == '#') {
11982             bool ended = 0;
11983             do {
11984                 if (*p++ == '\n') {
11985                     ended = 1;
11986                     break;
11987                 }
11988             } while (p < e);
11989             if (!ended)
11990                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11991         }
11992         else
11993             break;
11994     }
11995     return p;
11996 }
11997
11998 STATIC char *
11999 S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
12000 {
12001     /* Returns the next non-pattern-white space, non-comment character (the
12002      * latter only if 'recognize_comment is true) in the string p, which is
12003      * ended by RExC_end.  If there is no line break ending a comment,
12004      * RExC_seen has added the REG_SEEN_RUN_ON_COMMENT flag; */
12005     const char *e = RExC_end;
12006
12007     PERL_ARGS_ASSERT_REGPATWS;
12008
12009     while (p < e) {
12010         STRLEN len;
12011         if ((len = is_PATWS_safe(p, e, UTF))) {
12012             p += len;
12013         }
12014         else if (recognize_comment && *p == '#') {
12015             bool ended = 0;
12016             do {
12017                 p++;
12018                 if (is_LNBREAK_safe(p, e, UTF)) {
12019                     ended = 1;
12020                     break;
12021                 }
12022             } while (p < e);
12023             if (!ended)
12024                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
12025         }
12026         else
12027             break;
12028     }
12029     return p;
12030 }
12031
12032 STATIC void
12033 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
12034 {
12035     /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
12036      * sets up the bitmap and any flags, removing those code points from the
12037      * inversion list, setting it to NULL should it become completely empty */
12038
12039     PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
12040     assert(PL_regkind[OP(node)] == ANYOF);
12041
12042     ANYOF_BITMAP_ZERO(node);
12043     if (*invlist_ptr) {
12044
12045         /* This gets set if we actually need to modify things */
12046         bool change_invlist = FALSE;
12047
12048         UV start, end;
12049
12050         /* Start looking through *invlist_ptr */
12051         invlist_iterinit(*invlist_ptr);
12052         while (invlist_iternext(*invlist_ptr, &start, &end)) {
12053             UV high;
12054             int i;
12055
12056             if (end == UV_MAX && start <= 256) {
12057                 ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL;
12058             }
12059
12060             /* Quit if are above what we should change */
12061             if (start > 255) {
12062                 break;
12063             }
12064
12065             change_invlist = TRUE;
12066
12067             /* Set all the bits in the range, up to the max that we are doing */
12068             high = (end < 255) ? end : 255;
12069             for (i = start; i <= (int) high; i++) {
12070                 if (! ANYOF_BITMAP_TEST(node, i)) {
12071                     ANYOF_BITMAP_SET(node, i);
12072                 }
12073             }
12074         }
12075         invlist_iterfinish(*invlist_ptr);
12076
12077         /* Done with loop; remove any code points that are in the bitmap from
12078          * *invlist_ptr; similarly for code points above latin1 if we have a flag
12079          * to match all of them anyways */
12080         if (change_invlist) {
12081             _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr);
12082         }
12083         if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
12084             _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr);
12085         }
12086
12087         /* If have completely emptied it, remove it completely */
12088         if (_invlist_len(*invlist_ptr) == 0) {
12089             SvREFCNT_dec_NN(*invlist_ptr);
12090             *invlist_ptr = NULL;
12091         }
12092     }
12093 }
12094
12095 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
12096    Character classes ([:foo:]) can also be negated ([:^foo:]).
12097    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
12098    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
12099    but trigger failures because they are currently unimplemented. */
12100
12101 #define POSIXCC_DONE(c)   ((c) == ':')
12102 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
12103 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
12104
12105 PERL_STATIC_INLINE I32
12106 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
12107 {
12108     dVAR;
12109     I32 namedclass = OOB_NAMEDCLASS;
12110
12111     PERL_ARGS_ASSERT_REGPPOSIXCC;
12112
12113     if (value == '[' && RExC_parse + 1 < RExC_end &&
12114         /* I smell either [: or [= or [. -- POSIX has been here, right? */
12115         POSIXCC(UCHARAT(RExC_parse)))
12116     {
12117         const char c = UCHARAT(RExC_parse);
12118         char* const s = RExC_parse++;
12119
12120         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
12121             RExC_parse++;
12122         if (RExC_parse == RExC_end) {
12123             if (strict) {
12124
12125                 /* Try to give a better location for the error (than the end of
12126                  * the string) by looking for the matching ']' */
12127                 RExC_parse = s;
12128                 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
12129                     RExC_parse++;
12130                 }
12131                 vFAIL2("Unmatched '%c' in POSIX class", c);
12132             }
12133             /* Grandfather lone [:, [=, [. */
12134             RExC_parse = s;
12135         }
12136         else {
12137             const char* const t = RExC_parse++; /* skip over the c */
12138             assert(*t == c);
12139
12140             if (UCHARAT(RExC_parse) == ']') {
12141                 const char *posixcc = s + 1;
12142                 RExC_parse++; /* skip over the ending ] */
12143
12144                 if (*s == ':') {
12145                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
12146                     const I32 skip = t - posixcc;
12147
12148                     /* Initially switch on the length of the name.  */
12149                     switch (skip) {
12150                     case 4:
12151                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
12152                                                           this is the Perl \w
12153                                                         */
12154                             namedclass = ANYOF_WORDCHAR;
12155                         break;
12156                     case 5:
12157                         /* Names all of length 5.  */
12158                         /* alnum alpha ascii blank cntrl digit graph lower
12159                            print punct space upper  */
12160                         /* Offset 4 gives the best switch position.  */
12161                         switch (posixcc[4]) {
12162                         case 'a':
12163                             if (memEQ(posixcc, "alph", 4)) /* alpha */
12164                                 namedclass = ANYOF_ALPHA;
12165                             break;
12166                         case 'e':
12167                             if (memEQ(posixcc, "spac", 4)) /* space */
12168                                 namedclass = ANYOF_PSXSPC;
12169                             break;
12170                         case 'h':
12171                             if (memEQ(posixcc, "grap", 4)) /* graph */
12172                                 namedclass = ANYOF_GRAPH;
12173                             break;
12174                         case 'i':
12175                             if (memEQ(posixcc, "asci", 4)) /* ascii */
12176                                 namedclass = ANYOF_ASCII;
12177                             break;
12178                         case 'k':
12179                             if (memEQ(posixcc, "blan", 4)) /* blank */
12180                                 namedclass = ANYOF_BLANK;
12181                             break;
12182                         case 'l':
12183                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
12184                                 namedclass = ANYOF_CNTRL;
12185                             break;
12186                         case 'm':
12187                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
12188                                 namedclass = ANYOF_ALPHANUMERIC;
12189                             break;
12190                         case 'r':
12191                             if (memEQ(posixcc, "lowe", 4)) /* lower */
12192                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
12193                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
12194                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
12195                             break;
12196                         case 't':
12197                             if (memEQ(posixcc, "digi", 4)) /* digit */
12198                                 namedclass = ANYOF_DIGIT;
12199                             else if (memEQ(posixcc, "prin", 4)) /* print */
12200                                 namedclass = ANYOF_PRINT;
12201                             else if (memEQ(posixcc, "punc", 4)) /* punct */
12202                                 namedclass = ANYOF_PUNCT;
12203                             break;
12204                         }
12205                         break;
12206                     case 6:
12207                         if (memEQ(posixcc, "xdigit", 6))
12208                             namedclass = ANYOF_XDIGIT;
12209                         break;
12210                     }
12211
12212                     if (namedclass == OOB_NAMEDCLASS)
12213                         vFAIL2utf8f(
12214                             "POSIX class [:%"UTF8f":] unknown",
12215                             UTF8fARG(UTF, t - s - 1, s + 1));
12216
12217                     /* The #defines are structured so each complement is +1 to
12218                      * the normal one */
12219                     if (complement) {
12220                         namedclass++;
12221                     }
12222                     assert (posixcc[skip] == ':');
12223                     assert (posixcc[skip+1] == ']');
12224                 } else if (!SIZE_ONLY) {
12225                     /* [[=foo=]] and [[.foo.]] are still future. */
12226
12227                     /* adjust RExC_parse so the warning shows after
12228                        the class closes */
12229                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
12230                         RExC_parse++;
12231                     vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
12232                 }
12233             } else {
12234                 /* Maternal grandfather:
12235                  * "[:" ending in ":" but not in ":]" */
12236                 if (strict) {
12237                     vFAIL("Unmatched '[' in POSIX class");
12238                 }
12239
12240                 /* Grandfather lone [:, [=, [. */
12241                 RExC_parse = s;
12242             }
12243         }
12244     }
12245
12246     return namedclass;
12247 }
12248
12249 STATIC bool
12250 S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state)
12251 {
12252     /* This applies some heuristics at the current parse position (which should
12253      * be at a '[') to see if what follows might be intended to be a [:posix:]
12254      * class.  It returns true if it really is a posix class, of course, but it
12255      * also can return true if it thinks that what was intended was a posix
12256      * class that didn't quite make it.
12257      *
12258      * It will return true for
12259      *      [:alphanumerics:
12260      *      [:alphanumerics]  (as long as the ] isn't followed immediately by a
12261      *                         ')' indicating the end of the (?[
12262      *      [:any garbage including %^&$ punctuation:]
12263      *
12264      * This is designed to be called only from S_handle_regex_sets; it could be
12265      * easily adapted to be called from the spot at the beginning of regclass()
12266      * that checks to see in a normal bracketed class if the surrounding []
12267      * have been omitted ([:word:] instead of [[:word:]]).  But doing so would
12268      * change long-standing behavior, so I (khw) didn't do that */
12269     char* p = RExC_parse + 1;
12270     char first_char = *p;
12271
12272     PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
12273
12274     assert(*(p - 1) == '[');
12275
12276     if (! POSIXCC(first_char)) {
12277         return FALSE;
12278     }
12279
12280     p++;
12281     while (p < RExC_end && isWORDCHAR(*p)) p++;
12282
12283     if (p >= RExC_end) {
12284         return FALSE;
12285     }
12286
12287     if (p - RExC_parse > 2    /* Got at least 1 word character */
12288         && (*p == first_char
12289             || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
12290     {
12291         return TRUE;
12292     }
12293
12294     p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
12295
12296     return (p
12297             && p - RExC_parse > 2 /* [:] evaluates to colon;
12298                                       [::] is a bad posix class. */
12299             && first_char == *(p - 1));
12300 }
12301
12302 STATIC regnode *
12303 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *flagp, U32 depth,
12304                    char * const oregcomp_parse)
12305 {
12306     /* Handle the (?[...]) construct to do set operations */
12307
12308     U8 curchar;
12309     UV start, end;      /* End points of code point ranges */
12310     SV* result_string;
12311     char *save_end, *save_parse;
12312     SV* final;
12313     STRLEN len;
12314     regnode* node;
12315     AV* stack;
12316     const bool save_fold = FOLD;
12317
12318     GET_RE_DEBUG_FLAGS_DECL;
12319
12320     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
12321
12322     if (LOC) {
12323         vFAIL("(?[...]) not valid in locale");
12324     }
12325     RExC_uni_semantics = 1;
12326
12327     /* This will return only an ANYOF regnode, or (unlikely) something smaller
12328      * (such as EXACT).  Thus we can skip most everything if just sizing.  We
12329      * call regclass to handle '[]' so as to not have to reinvent its parsing
12330      * rules here (throwing away the size it computes each time).  And, we exit
12331      * upon an unescaped ']' that isn't one ending a regclass.  To do both
12332      * these things, we need to realize that something preceded by a backslash
12333      * is escaped, so we have to keep track of backslashes */
12334     if (SIZE_ONLY) {
12335         UV depth = 0; /* how many nested (?[...]) constructs */
12336
12337         Perl_ck_warner_d(aTHX_
12338             packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
12339             "The regex_sets feature is experimental" REPORT_LOCATION,
12340                 UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
12341                 UTF8fARG(UTF, RExC_end - RExC_start - (RExC_parse - RExC_precomp), RExC_precomp + (RExC_parse - RExC_precomp)));
12342
12343         while (RExC_parse < RExC_end) {
12344             SV* current = NULL;
12345             RExC_parse = regpatws(pRExC_state, RExC_parse,
12346                                 TRUE); /* means recognize comments */
12347             switch (*RExC_parse) {
12348                 case '?':
12349                     if (RExC_parse[1] == '[') depth++, RExC_parse++;
12350                     /* FALL THROUGH */
12351                 default:
12352                     break;
12353                 case '\\':
12354                     /* Skip the next byte (which could cause us to end up in
12355                      * the middle of a UTF-8 character, but since none of those
12356                      * are confusable with anything we currently handle in this
12357                      * switch (invariants all), it's safe.  We'll just hit the
12358                      * default: case next time and keep on incrementing until
12359                      * we find one of the invariants we do handle. */
12360                     RExC_parse++;
12361                     break;
12362                 case '[':
12363                 {
12364                     /* If this looks like it is a [:posix:] class, leave the
12365                      * parse pointer at the '[' to fool regclass() into
12366                      * thinking it is part of a '[[:posix:]]'.  That function
12367                      * will use strict checking to force a syntax error if it
12368                      * doesn't work out to a legitimate class */
12369                     bool is_posix_class
12370                                     = could_it_be_a_POSIX_class(pRExC_state);
12371                     if (! is_posix_class) {
12372                         RExC_parse++;
12373                     }
12374
12375                     /* regclass() can only return RESTART_UTF8 if multi-char
12376                        folds are allowed.  */
12377                     if (!regclass(pRExC_state, flagp,depth+1,
12378                                   is_posix_class, /* parse the whole char
12379                                                      class only if not a
12380                                                      posix class */
12381                                   FALSE, /* don't allow multi-char folds */
12382                                   TRUE, /* silence non-portable warnings. */
12383                                   &current))
12384                         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12385                               (UV) *flagp);
12386
12387                     /* function call leaves parse pointing to the ']', except
12388                      * if we faked it */
12389                     if (is_posix_class) {
12390                         RExC_parse--;
12391                     }
12392
12393                     SvREFCNT_dec(current);   /* In case it returned something */
12394                     break;
12395                 }
12396
12397                 case ']':
12398                     if (depth--) break;
12399                     RExC_parse++;
12400                     if (RExC_parse < RExC_end
12401                         && *RExC_parse == ')')
12402                     {
12403                         node = reganode(pRExC_state, ANYOF, 0);
12404                         RExC_size += ANYOF_SKIP;
12405                         nextchar(pRExC_state);
12406                         Set_Node_Length(node,
12407                                 RExC_parse - oregcomp_parse + 1); /* MJD */
12408                         return node;
12409                     }
12410                     goto no_close;
12411             }
12412             RExC_parse++;
12413         }
12414
12415         no_close:
12416         FAIL("Syntax error in (?[...])");
12417     }
12418
12419     /* Pass 2 only after this.  Everything in this construct is a
12420      * metacharacter.  Operands begin with either a '\' (for an escape
12421      * sequence), or a '[' for a bracketed character class.  Any other
12422      * character should be an operator, or parenthesis for grouping.  Both
12423      * types of operands are handled by calling regclass() to parse them.  It
12424      * is called with a parameter to indicate to return the computed inversion
12425      * list.  The parsing here is implemented via a stack.  Each entry on the
12426      * stack is a single character representing one of the operators, or the
12427      * '('; or else a pointer to an operand inversion list. */
12428
12429 #define IS_OPERAND(a)  (! SvIOK(a))
12430
12431     /* The stack starts empty.  It is a syntax error if the first thing parsed
12432      * is a binary operator; everything else is pushed on the stack.  When an
12433      * operand is parsed, the top of the stack is examined.  If it is a binary
12434      * operator, the item before it should be an operand, and both are replaced
12435      * by the result of doing that operation on the new operand and the one on
12436      * the stack.   Thus a sequence of binary operands is reduced to a single
12437      * one before the next one is parsed.
12438      *
12439      * A unary operator may immediately follow a binary in the input, for
12440      * example
12441      *      [a] + ! [b]
12442      * When an operand is parsed and the top of the stack is a unary operator,
12443      * the operation is performed, and then the stack is rechecked to see if
12444      * this new operand is part of a binary operation; if so, it is handled as
12445      * above.
12446      *
12447      * A '(' is simply pushed on the stack; it is valid only if the stack is
12448      * empty, or the top element of the stack is an operator or another '('
12449      * (for which the parenthesized expression will become an operand).  By the
12450      * time the corresponding ')' is parsed everything in between should have
12451      * been parsed and evaluated to a single operand (or else is a syntax
12452      * error), and is handled as a regular operand */
12453
12454     sv_2mortal((SV *)(stack = newAV()));
12455
12456     while (RExC_parse < RExC_end) {
12457         I32 top_index = av_tindex(stack);
12458         SV** top_ptr;
12459         SV* current = NULL;
12460
12461         /* Skip white space */
12462         RExC_parse = regpatws(pRExC_state, RExC_parse,
12463                                 TRUE); /* means recognize comments */
12464         if (RExC_parse >= RExC_end) {
12465             Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
12466         }
12467         if ((curchar = UCHARAT(RExC_parse)) == ']') {
12468             break;
12469         }
12470
12471         switch (curchar) {
12472
12473             case '?':
12474                 if (av_tindex(stack) >= 0   /* This makes sure that we can
12475                                                safely subtract 1 from
12476                                                RExC_parse in the next clause.
12477                                                If we have something on the
12478                                                stack, we have parsed something
12479                                              */
12480                     && UCHARAT(RExC_parse - 1) == '('
12481                     && RExC_parse < RExC_end)
12482                 {
12483                     /* If is a '(?', could be an embedded '(?flags:(?[...])'.
12484                      * This happens when we have some thing like
12485                      *
12486                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
12487                      *   ...
12488                      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
12489                      *
12490                      * Here we would be handling the interpolated
12491                      * '$thai_or_lao'.  We handle this by a recursive call to
12492                      * ourselves which returns the inversion list the
12493                      * interpolated expression evaluates to.  We use the flags
12494                      * from the interpolated pattern. */
12495                     U32 save_flags = RExC_flags;
12496                     const char * const save_parse = ++RExC_parse;
12497
12498                     parse_lparen_question_flags(pRExC_state);
12499
12500                     if (RExC_parse == save_parse  /* Makes sure there was at
12501                                                      least one flag (or this
12502                                                      embedding wasn't compiled)
12503                                                    */
12504                         || RExC_parse >= RExC_end - 4
12505                         || UCHARAT(RExC_parse) != ':'
12506                         || UCHARAT(++RExC_parse) != '('
12507                         || UCHARAT(++RExC_parse) != '?'
12508                         || UCHARAT(++RExC_parse) != '[')
12509                     {
12510
12511                         /* In combination with the above, this moves the
12512                          * pointer to the point just after the first erroneous
12513                          * character (or if there are no flags, to where they
12514                          * should have been) */
12515                         if (RExC_parse >= RExC_end - 4) {
12516                             RExC_parse = RExC_end;
12517                         }
12518                         else if (RExC_parse != save_parse) {
12519                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12520                         }
12521                         vFAIL("Expecting '(?flags:(?[...'");
12522                     }
12523                     RExC_parse++;
12524                     (void) handle_regex_sets(pRExC_state, &current, flagp,
12525                                                     depth+1, oregcomp_parse);
12526
12527                     /* Here, 'current' contains the embedded expression's
12528                      * inversion list, and RExC_parse points to the trailing
12529                      * ']'; the next character should be the ')' which will be
12530                      * paired with the '(' that has been put on the stack, so
12531                      * the whole embedded expression reduces to '(operand)' */
12532                     RExC_parse++;
12533
12534                     RExC_flags = save_flags;
12535                     goto handle_operand;
12536                 }
12537                 /* FALL THROUGH */
12538
12539             default:
12540                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12541                 vFAIL("Unexpected character");
12542
12543             case '\\':
12544                 /* regclass() can only return RESTART_UTF8 if multi-char
12545                    folds are allowed.  */
12546                 if (!regclass(pRExC_state, flagp,depth+1,
12547                               TRUE, /* means parse just the next thing */
12548                               FALSE, /* don't allow multi-char folds */
12549                               FALSE, /* don't silence non-portable warnings.  */
12550                               &current))
12551                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12552                           (UV) *flagp);
12553                 /* regclass() will return with parsing just the \ sequence,
12554                  * leaving the parse pointer at the next thing to parse */
12555                 RExC_parse--;
12556                 goto handle_operand;
12557
12558             case '[':   /* Is a bracketed character class */
12559             {
12560                 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
12561
12562                 if (! is_posix_class) {
12563                     RExC_parse++;
12564                 }
12565
12566                 /* regclass() can only return RESTART_UTF8 if multi-char
12567                    folds are allowed.  */
12568                 if(!regclass(pRExC_state, flagp,depth+1,
12569                              is_posix_class, /* parse the whole char class
12570                                                 only if not a posix class */
12571                              FALSE, /* don't allow multi-char folds */
12572                              FALSE, /* don't silence non-portable warnings.  */
12573                              &current))
12574                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12575                           (UV) *flagp);
12576                 /* function call leaves parse pointing to the ']', except if we
12577                  * faked it */
12578                 if (is_posix_class) {
12579                     RExC_parse--;
12580                 }
12581
12582                 goto handle_operand;
12583             }
12584
12585             case '&':
12586             case '|':
12587             case '+':
12588             case '-':
12589             case '^':
12590                 if (top_index < 0
12591                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
12592                     || ! IS_OPERAND(*top_ptr))
12593                 {
12594                     RExC_parse++;
12595                     vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
12596                 }
12597                 av_push(stack, newSVuv(curchar));
12598                 break;
12599
12600             case '!':
12601                 av_push(stack, newSVuv(curchar));
12602                 break;
12603
12604             case '(':
12605                 if (top_index >= 0) {
12606                     top_ptr = av_fetch(stack, top_index, FALSE);
12607                     assert(top_ptr);
12608                     if (IS_OPERAND(*top_ptr)) {
12609                         RExC_parse++;
12610                         vFAIL("Unexpected '(' with no preceding operator");
12611                     }
12612                 }
12613                 av_push(stack, newSVuv(curchar));
12614                 break;
12615
12616             case ')':
12617             {
12618                 SV* lparen;
12619                 if (top_index < 1
12620                     || ! (current = av_pop(stack))
12621                     || ! IS_OPERAND(current)
12622                     || ! (lparen = av_pop(stack))
12623                     || IS_OPERAND(lparen)
12624                     || SvUV(lparen) != '(')
12625                 {
12626                     SvREFCNT_dec(current);
12627                     RExC_parse++;
12628                     vFAIL("Unexpected ')'");
12629                 }
12630                 top_index -= 2;
12631                 SvREFCNT_dec_NN(lparen);
12632
12633                 /* FALL THROUGH */
12634             }
12635
12636               handle_operand:
12637
12638                 /* Here, we have an operand to process, in 'current' */
12639
12640                 if (top_index < 0) {    /* Just push if stack is empty */
12641                     av_push(stack, current);
12642                 }
12643                 else {
12644                     SV* top = av_pop(stack);
12645                     SV *prev = NULL;
12646                     char current_operator;
12647
12648                     if (IS_OPERAND(top)) {
12649                         SvREFCNT_dec_NN(top);
12650                         SvREFCNT_dec_NN(current);
12651                         vFAIL("Operand with no preceding operator");
12652                     }
12653                     current_operator = (char) SvUV(top);
12654                     switch (current_operator) {
12655                         case '(':   /* Push the '(' back on followed by the new
12656                                        operand */
12657                             av_push(stack, top);
12658                             av_push(stack, current);
12659                             SvREFCNT_inc(top);  /* Counters the '_dec' done
12660                                                    just after the 'break', so
12661                                                    it doesn't get wrongly freed
12662                                                  */
12663                             break;
12664
12665                         case '!':
12666                             _invlist_invert(current);
12667
12668                             /* Unlike binary operators, the top of the stack,
12669                              * now that this unary one has been popped off, may
12670                              * legally be an operator, and we now have operand
12671                              * for it. */
12672                             top_index--;
12673                             SvREFCNT_dec_NN(top);
12674                             goto handle_operand;
12675
12676                         case '&':
12677                             prev = av_pop(stack);
12678                             _invlist_intersection(prev,
12679                                                    current,
12680                                                    &current);
12681                             av_push(stack, current);
12682                             break;
12683
12684                         case '|':
12685                         case '+':
12686                             prev = av_pop(stack);
12687                             _invlist_union(prev, current, &current);
12688                             av_push(stack, current);
12689                             break;
12690
12691                         case '-':
12692                             prev = av_pop(stack);;
12693                             _invlist_subtract(prev, current, &current);
12694                             av_push(stack, current);
12695                             break;
12696
12697                         case '^':   /* The union minus the intersection */
12698                         {
12699                             SV* i = NULL;
12700                             SV* u = NULL;
12701                             SV* element;
12702
12703                             prev = av_pop(stack);
12704                             _invlist_union(prev, current, &u);
12705                             _invlist_intersection(prev, current, &i);
12706                             /* _invlist_subtract will overwrite current
12707                                 without freeing what it already contains */
12708                             element = current;
12709                             _invlist_subtract(u, i, &current);
12710                             av_push(stack, current);
12711                             SvREFCNT_dec_NN(i);
12712                             SvREFCNT_dec_NN(u);
12713                             SvREFCNT_dec_NN(element);
12714                             break;
12715                         }
12716
12717                         default:
12718                             Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
12719                 }
12720                 SvREFCNT_dec_NN(top);
12721                 SvREFCNT_dec(prev);
12722             }
12723         }
12724
12725         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12726     }
12727
12728     if (av_tindex(stack) < 0   /* Was empty */
12729         || ((final = av_pop(stack)) == NULL)
12730         || ! IS_OPERAND(final)
12731         || av_tindex(stack) >= 0)  /* More left on stack */
12732     {
12733         vFAIL("Incomplete expression within '(?[ ])'");
12734     }
12735
12736     /* Here, 'final' is the resultant inversion list from evaluating the
12737      * expression.  Return it if so requested */
12738     if (return_invlist) {
12739         *return_invlist = final;
12740         return END;
12741     }
12742
12743     /* Otherwise generate a resultant node, based on 'final'.  regclass() is
12744      * expecting a string of ranges and individual code points */
12745     invlist_iterinit(final);
12746     result_string = newSVpvs("");
12747     while (invlist_iternext(final, &start, &end)) {
12748         if (start == end) {
12749             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
12750         }
12751         else {
12752             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
12753                                                      start,          end);
12754         }
12755     }
12756
12757     save_parse = RExC_parse;
12758     RExC_parse = SvPV(result_string, len);
12759     save_end = RExC_end;
12760     RExC_end = RExC_parse + len;
12761
12762     /* We turn off folding around the call, as the class we have constructed
12763      * already has all folding taken into consideration, and we don't want
12764      * regclass() to add to that */
12765     RExC_flags &= ~RXf_PMf_FOLD;
12766     /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
12767      */
12768     node = regclass(pRExC_state, flagp,depth+1,
12769                     FALSE, /* means parse the whole char class */
12770                     FALSE, /* don't allow multi-char folds */
12771                     TRUE, /* silence non-portable warnings.  The above may very
12772                              well have generated non-portable code points, but
12773                              they're valid on this machine */
12774                     NULL);
12775     if (!node)
12776         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
12777                     PTR2UV(flagp));
12778     if (save_fold) {
12779         RExC_flags |= RXf_PMf_FOLD;
12780     }
12781     RExC_parse = save_parse + 1;
12782     RExC_end = save_end;
12783     SvREFCNT_dec_NN(final);
12784     SvREFCNT_dec_NN(result_string);
12785
12786     nextchar(pRExC_state);
12787     Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
12788     return node;
12789 }
12790 #undef IS_OPERAND
12791
12792 /* The names of properties whose definitions are not known at compile time are
12793  * stored in this SV, after a constant heading.  So if the length has been
12794  * changed since initialization, then there is a run-time definition. */
12795 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
12796
12797 STATIC regnode *
12798 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
12799                  const bool stop_at_1,  /* Just parse the next thing, don't
12800                                            look for a full character class */
12801                  bool allow_multi_folds,
12802                  const bool silence_non_portable,   /* Don't output warnings
12803                                                        about too large
12804                                                        characters */
12805                  SV** ret_invlist)  /* Return an inversion list, not a node */
12806 {
12807     /* parse a bracketed class specification.  Most of these will produce an
12808      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
12809      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
12810      * under /i with multi-character folds: it will be rewritten following the
12811      * paradigm of this example, where the <multi-fold>s are characters which
12812      * fold to multiple character sequences:
12813      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
12814      * gets effectively rewritten as:
12815      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
12816      * reg() gets called (recursively) on the rewritten version, and this
12817      * function will return what it constructs.  (Actually the <multi-fold>s
12818      * aren't physically removed from the [abcdefghi], it's just that they are
12819      * ignored in the recursion by means of a flag:
12820      * <RExC_in_multi_char_class>.)
12821      *
12822      * ANYOF nodes contain a bit map for the first 256 characters, with the
12823      * corresponding bit set if that character is in the list.  For characters
12824      * above 255, a range list or swash is used.  There are extra bits for \w,
12825      * etc. in locale ANYOFs, as what these match is not determinable at
12826      * compile time
12827      *
12828      * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
12829      * to be restarted.  This can only happen if ret_invlist is non-NULL.
12830      */
12831
12832     dVAR;
12833     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
12834     IV range = 0;
12835     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
12836     regnode *ret;
12837     STRLEN numlen;
12838     IV namedclass = OOB_NAMEDCLASS;
12839     char *rangebegin = NULL;
12840     bool need_class = 0;
12841     SV *listsv = NULL;
12842     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
12843                                       than just initialized.  */
12844     SV* properties = NULL;    /* Code points that match \p{} \P{} */
12845     SV* posixes = NULL;     /* Code points that match classes like [:word:],
12846                                extended beyond the Latin1 range.  These have to
12847                                be kept separate from other code points for much
12848                                of this function because their handling  is
12849                                different under /i, and for most classes under
12850                                /d as well */
12851     UV element_count = 0;   /* Number of distinct elements in the class.
12852                                Optimizations may be possible if this is tiny */
12853     AV * multi_char_matches = NULL; /* Code points that fold to more than one
12854                                        character; used under /i */
12855     UV n;
12856     char * stop_ptr = RExC_end;    /* where to stop parsing */
12857     const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
12858                                                    space? */
12859     const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
12860
12861     /* Unicode properties are stored in a swash; this holds the current one
12862      * being parsed.  If this swash is the only above-latin1 component of the
12863      * character class, an optimization is to pass it directly on to the
12864      * execution engine.  Otherwise, it is set to NULL to indicate that there
12865      * are other things in the class that have to be dealt with at execution
12866      * time */
12867     SV* swash = NULL;           /* Code points that match \p{} \P{} */
12868
12869     /* Set if a component of this character class is user-defined; just passed
12870      * on to the engine */
12871     bool has_user_defined_property = FALSE;
12872
12873     /* inversion list of code points this node matches only when the target
12874      * string is in UTF-8.  (Because is under /d) */
12875     SV* depends_list = NULL;
12876
12877     /* inversion list of code points this node matches.  For much of the
12878      * function, it includes only those that match regardless of the utf8ness
12879      * of the target string */
12880     SV* cp_list = NULL;
12881
12882 #ifdef EBCDIC
12883     /* In a range, counts how many 0-2 of the ends of it came from literals,
12884      * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
12885     UV literal_endpoint = 0;
12886 #endif
12887     bool invert = FALSE;    /* Is this class to be complemented */
12888
12889     /* Is there any thing like \W or [:^digit:] that matches above the legal
12890      * Unicode range? */
12891     bool runtime_posix_matches_above_Unicode = FALSE;
12892
12893     bool warn_super = ALWAYS_WARN_SUPER;
12894
12895     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
12896         case we need to change the emitted regop to an EXACT. */
12897     const char * orig_parse = RExC_parse;
12898     const SSize_t orig_size = RExC_size;
12899     bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
12900     GET_RE_DEBUG_FLAGS_DECL;
12901
12902     PERL_ARGS_ASSERT_REGCLASS;
12903 #ifndef DEBUGGING
12904     PERL_UNUSED_ARG(depth);
12905 #endif
12906
12907     DEBUG_PARSE("clas");
12908
12909     /* Assume we are going to generate an ANYOF node. */
12910     ret = reganode(pRExC_state, ANYOF, 0);
12911
12912     if (SIZE_ONLY) {
12913         RExC_size += ANYOF_SKIP;
12914         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
12915     }
12916     else {
12917         ANYOF_FLAGS(ret) = 0;
12918
12919         RExC_emit += ANYOF_SKIP;
12920         if (LOC) {
12921             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
12922         }
12923         listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
12924         initial_listsv_len = SvCUR(listsv);
12925         SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
12926     }
12927
12928     if (skip_white) {
12929         RExC_parse = regpatws(pRExC_state, RExC_parse,
12930                               FALSE /* means don't recognize comments */);
12931     }
12932
12933     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
12934         RExC_parse++;
12935         invert = TRUE;
12936         allow_multi_folds = FALSE;
12937         RExC_naughty++;
12938         if (skip_white) {
12939             RExC_parse = regpatws(pRExC_state, RExC_parse,
12940                                   FALSE /* means don't recognize comments */);
12941         }
12942     }
12943
12944     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
12945     if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
12946         const char *s = RExC_parse;
12947         const char  c = *s++;
12948
12949         while (isWORDCHAR(*s))
12950             s++;
12951         if (*s && c == *s && s[1] == ']') {
12952             SAVEFREESV(RExC_rx_sv);
12953             ckWARN3reg(s+2,
12954                        "POSIX syntax [%c %c] belongs inside character classes",
12955                        c, c);
12956             (void)ReREFCNT_inc(RExC_rx_sv);
12957         }
12958     }
12959
12960     /* If the caller wants us to just parse a single element, accomplish this
12961      * by faking the loop ending condition */
12962     if (stop_at_1 && RExC_end > RExC_parse) {
12963         stop_ptr = RExC_parse + 1;
12964     }
12965
12966     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
12967     if (UCHARAT(RExC_parse) == ']')
12968         goto charclassloop;
12969
12970 parseit:
12971     while (1) {
12972         if  (RExC_parse >= stop_ptr) {
12973             break;
12974         }
12975
12976         if (skip_white) {
12977             RExC_parse = regpatws(pRExC_state, RExC_parse,
12978                                   FALSE /* means don't recognize comments */);
12979         }
12980
12981         if  (UCHARAT(RExC_parse) == ']') {
12982             break;
12983         }
12984
12985     charclassloop:
12986
12987         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
12988         save_value = value;
12989         save_prevvalue = prevvalue;
12990
12991         if (!range) {
12992             rangebegin = RExC_parse;
12993             element_count++;
12994         }
12995         if (UTF) {
12996             value = utf8n_to_uvchr((U8*)RExC_parse,
12997                                    RExC_end - RExC_parse,
12998                                    &numlen, UTF8_ALLOW_DEFAULT);
12999             RExC_parse += numlen;
13000         }
13001         else
13002             value = UCHARAT(RExC_parse++);
13003
13004         if (value == '['
13005             && RExC_parse < RExC_end
13006             && POSIXCC(UCHARAT(RExC_parse)))
13007         {
13008             namedclass = regpposixcc(pRExC_state, value, strict);
13009         }
13010         else if (value == '\\') {
13011             if (UTF) {
13012                 value = utf8n_to_uvchr((U8*)RExC_parse,
13013                                    RExC_end - RExC_parse,
13014                                    &numlen, UTF8_ALLOW_DEFAULT);
13015                 RExC_parse += numlen;
13016             }
13017             else
13018                 value = UCHARAT(RExC_parse++);
13019
13020             /* Some compilers cannot handle switching on 64-bit integer
13021              * values, therefore value cannot be an UV.  Yes, this will
13022              * be a problem later if we want switch on Unicode.
13023              * A similar issue a little bit later when switching on
13024              * namedclass. --jhi */
13025
13026             /* If the \ is escaping white space when white space is being
13027              * skipped, it means that that white space is wanted literally, and
13028              * is already in 'value'.  Otherwise, need to translate the escape
13029              * into what it signifies. */
13030             if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
13031
13032             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
13033             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
13034             case 's':   namedclass = ANYOF_SPACE;       break;
13035             case 'S':   namedclass = ANYOF_NSPACE;      break;
13036             case 'd':   namedclass = ANYOF_DIGIT;       break;
13037             case 'D':   namedclass = ANYOF_NDIGIT;      break;
13038             case 'v':   namedclass = ANYOF_VERTWS;      break;
13039             case 'V':   namedclass = ANYOF_NVERTWS;     break;
13040             case 'h':   namedclass = ANYOF_HORIZWS;     break;
13041             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
13042             case 'N':  /* Handle \N{NAME} in class */
13043                 {
13044                     /* We only pay attention to the first char of
13045                     multichar strings being returned. I kinda wonder
13046                     if this makes sense as it does change the behaviour
13047                     from earlier versions, OTOH that behaviour was broken
13048                     as well. */
13049                     if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
13050                                       TRUE, /* => charclass */
13051                                       strict))
13052                     {
13053                         if (*flagp & RESTART_UTF8)
13054                             FAIL("panic: grok_bslash_N set RESTART_UTF8");
13055                         goto parseit;
13056                     }
13057                 }
13058                 break;
13059             case 'p':
13060             case 'P':
13061                 {
13062                 char *e;
13063
13064                 /* We will handle any undefined properties ourselves */
13065                 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
13066                                        /* And we actually would prefer to get
13067                                         * the straight inversion list of the
13068                                         * swash, since we will be accessing it
13069                                         * anyway, to save a little time */
13070                                       |_CORE_SWASH_INIT_ACCEPT_INVLIST;
13071
13072                 if (RExC_parse >= RExC_end)
13073                     vFAIL2("Empty \\%c{}", (U8)value);
13074                 if (*RExC_parse == '{') {
13075                     const U8 c = (U8)value;
13076                     e = strchr(RExC_parse++, '}');
13077                     if (!e)
13078                         vFAIL2("Missing right brace on \\%c{}", c);
13079                     while (isSPACE(UCHARAT(RExC_parse)))
13080                         RExC_parse++;
13081                     if (e == RExC_parse)
13082                         vFAIL2("Empty \\%c{}", c);
13083                     n = e - RExC_parse;
13084                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
13085                         n--;
13086                 }
13087                 else {
13088                     e = RExC_parse;
13089                     n = 1;
13090                 }
13091                 if (!SIZE_ONLY) {
13092                     SV* invlist;
13093                     char* formatted;
13094                     char* name;
13095
13096                     if (UCHARAT(RExC_parse) == '^') {
13097                          RExC_parse++;
13098                          n--;
13099                          /* toggle.  (The rhs xor gets the single bit that
13100                           * differs between P and p; the other xor inverts just
13101                           * that bit) */
13102                          value ^= 'P' ^ 'p';
13103
13104                          while (isSPACE(UCHARAT(RExC_parse))) {
13105                               RExC_parse++;
13106                               n--;
13107                          }
13108                     }
13109                     /* Try to get the definition of the property into
13110                      * <invlist>.  If /i is in effect, the effective property
13111                      * will have its name be <__NAME_i>.  The design is
13112                      * discussed in commit
13113                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
13114                     formatted = Perl_form(aTHX_
13115                                           "%s%.*s%s\n",
13116                                           (FOLD) ? "__" : "",
13117                                           (int)n,
13118                                           RExC_parse,
13119                                           (FOLD) ? "_i" : ""
13120                                 );
13121                     name = savepvn(formatted, strlen(formatted));
13122
13123                     /* Look up the property name, and get its swash and
13124                      * inversion list, if the property is found  */
13125                     if (swash) {
13126                         SvREFCNT_dec_NN(swash);
13127                     }
13128                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
13129                                              1, /* binary */
13130                                              0, /* not tr/// */
13131                                              NULL, /* No inversion list */
13132                                              &swash_init_flags
13133                                             );
13134                     if (! swash || ! (invlist = _get_swash_invlist(swash))) {
13135                         if (swash) {
13136                             SvREFCNT_dec_NN(swash);
13137                             swash = NULL;
13138                         }
13139
13140                         /* Here didn't find it.  It could be a user-defined
13141                          * property that will be available at run-time.  If we
13142                          * accept only compile-time properties, is an error;
13143                          * otherwise add it to the list for run-time look up */
13144                         if (ret_invlist) {
13145                             RExC_parse = e + 1;
13146                             vFAIL2utf8f(
13147                                 "Property '%"UTF8f"' is unknown",
13148                                 UTF8fARG(UTF, n, name));
13149                         }
13150                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
13151                                         (value == 'p' ? '+' : '!'),
13152                                         UTF8fARG(UTF, n, name));
13153                         has_user_defined_property = TRUE;
13154
13155                         /* We don't know yet, so have to assume that the
13156                          * property could match something in the Latin1 range,
13157                          * hence something that isn't utf8.  Note that this
13158                          * would cause things in <depends_list> to match
13159                          * inappropriately, except that any \p{}, including
13160                          * this one forces Unicode semantics, which means there
13161                          * is no <depends_list> */
13162                         ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
13163                     }
13164                     else {
13165
13166                         /* Here, did get the swash and its inversion list.  If
13167                          * the swash is from a user-defined property, then this
13168                          * whole character class should be regarded as such */
13169                         if (swash_init_flags
13170                             & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
13171                         {
13172                             has_user_defined_property = TRUE;
13173                         }
13174                         else if
13175                             /* We warn on matching an above-Unicode code point
13176                              * if the match would return true, except don't
13177                              * warn for \p{All}, which has exactly one element
13178                              * = 0 */
13179                             (_invlist_contains_cp(invlist, 0x110000)
13180                                 && (! (_invlist_len(invlist) == 1
13181                                        && *invlist_array(invlist) == 0)))
13182                         {
13183                             warn_super = TRUE;
13184                         }
13185
13186
13187                         /* Invert if asking for the complement */
13188                         if (value == 'P') {
13189                             _invlist_union_complement_2nd(properties,
13190                                                           invlist,
13191                                                           &properties);
13192
13193                             /* The swash can't be used as-is, because we've
13194                              * inverted things; delay removing it to here after
13195                              * have copied its invlist above */
13196                             SvREFCNT_dec_NN(swash);
13197                             swash = NULL;
13198                         }
13199                         else {
13200                             _invlist_union(properties, invlist, &properties);
13201                         }
13202                     }
13203                     Safefree(name);
13204                 }
13205                 RExC_parse = e + 1;
13206                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
13207                                                 named */
13208
13209                 /* \p means they want Unicode semantics */
13210                 RExC_uni_semantics = 1;
13211                 }
13212                 break;
13213             case 'n':   value = '\n';                   break;
13214             case 'r':   value = '\r';                   break;
13215             case 't':   value = '\t';                   break;
13216             case 'f':   value = '\f';                   break;
13217             case 'b':   value = '\b';                   break;
13218             case 'e':   value = ASCII_TO_NATIVE('\033');break;
13219             case 'a':   value = '\a';                   break;
13220             case 'o':
13221                 RExC_parse--;   /* function expects to be pointed at the 'o' */
13222                 {
13223                     const char* error_msg;
13224                     bool valid = grok_bslash_o(&RExC_parse,
13225                                                &value,
13226                                                &error_msg,
13227                                                SIZE_ONLY,   /* warnings in pass
13228                                                                1 only */
13229                                                strict,
13230                                                silence_non_portable,
13231                                                UTF);
13232                     if (! valid) {
13233                         vFAIL(error_msg);
13234                     }
13235                 }
13236                 if (PL_encoding && value < 0x100) {
13237                     goto recode_encoding;
13238                 }
13239                 break;
13240             case 'x':
13241                 RExC_parse--;   /* function expects to be pointed at the 'x' */
13242                 {
13243                     const char* error_msg;
13244                     bool valid = grok_bslash_x(&RExC_parse,
13245                                                &value,
13246                                                &error_msg,
13247                                                TRUE, /* Output warnings */
13248                                                strict,
13249                                                silence_non_portable,
13250                                                UTF);
13251                     if (! valid) {
13252                         vFAIL(error_msg);
13253                     }
13254                 }
13255                 if (PL_encoding && value < 0x100)
13256                     goto recode_encoding;
13257                 break;
13258             case 'c':
13259                 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
13260                 break;
13261             case '0': case '1': case '2': case '3': case '4':
13262             case '5': case '6': case '7':
13263                 {
13264                     /* Take 1-3 octal digits */
13265                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
13266                     numlen = (strict) ? 4 : 3;
13267                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
13268                     RExC_parse += numlen;
13269                     if (numlen != 3) {
13270                         if (strict) {
13271                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13272                             vFAIL("Need exactly 3 octal digits");
13273                         }
13274                         else if (! SIZE_ONLY /* like \08, \178 */
13275                                  && numlen < 3
13276                                  && RExC_parse < RExC_end
13277                                  && isDIGIT(*RExC_parse)
13278                                  && ckWARN(WARN_REGEXP))
13279                         {
13280                             SAVEFREESV(RExC_rx_sv);
13281                             reg_warn_non_literal_string(
13282                                  RExC_parse + 1,
13283                                  form_short_octal_warning(RExC_parse, numlen));
13284                             (void)ReREFCNT_inc(RExC_rx_sv);
13285                         }
13286                     }
13287                     if (PL_encoding && value < 0x100)
13288                         goto recode_encoding;
13289                     break;
13290                 }
13291             recode_encoding:
13292                 if (! RExC_override_recoding) {
13293                     SV* enc = PL_encoding;
13294                     value = reg_recode((const char)(U8)value, &enc);
13295                     if (!enc) {
13296                         if (strict) {
13297                             vFAIL("Invalid escape in the specified encoding");
13298                         }
13299                         else if (SIZE_ONLY) {
13300                             ckWARNreg(RExC_parse,
13301                                   "Invalid escape in the specified encoding");
13302                         }
13303                     }
13304                     break;
13305                 }
13306             default:
13307                 /* Allow \_ to not give an error */
13308                 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
13309                     if (strict) {
13310                         vFAIL2("Unrecognized escape \\%c in character class",
13311                                (int)value);
13312                     }
13313                     else {
13314                         SAVEFREESV(RExC_rx_sv);
13315                         ckWARN2reg(RExC_parse,
13316                             "Unrecognized escape \\%c in character class passed through",
13317                             (int)value);
13318                         (void)ReREFCNT_inc(RExC_rx_sv);
13319                     }
13320                 }
13321                 break;
13322             }   /* End of switch on char following backslash */
13323         } /* end of handling backslash escape sequences */
13324 #ifdef EBCDIC
13325         else
13326             literal_endpoint++;
13327 #endif
13328
13329         /* Here, we have the current token in 'value' */
13330
13331         /* What matches in a locale is not known until runtime.  This includes
13332          * what the Posix classes (like \w, [:space:]) match.  Room must be
13333          * reserved (one time per outer bracketed class) to store such classes,
13334          * either if Perl is compiled so that locale nodes always should have
13335          * this space, or if there is such posix class info to be stored.  The
13336          * space will contain a bit for each named class that is to be matched
13337          * against.  This isn't needed for \p{} and pseudo-classes, as they are
13338          * not affected by locale, and hence are dealt with separately */
13339         if (LOC
13340             && ! need_class
13341             && (ANYOF_LOCALE == ANYOF_POSIXL
13342                 || (namedclass > OOB_NAMEDCLASS
13343                     && namedclass < ANYOF_POSIXL_MAX)))
13344         {
13345             need_class = 1;
13346             if (SIZE_ONLY) {
13347                 RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13348             }
13349             else {
13350                 RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13351             }
13352             ANYOF_POSIXL_ZERO(ret);
13353             ANYOF_FLAGS(ret) |= ANYOF_POSIXL;
13354         }
13355
13356         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
13357             U8 classnum;
13358
13359             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
13360              * literal, as is the character that began the false range, i.e.
13361              * the 'a' in the examples */
13362             if (range) {
13363                 if (!SIZE_ONLY) {
13364                     const int w = (RExC_parse >= rangebegin)
13365                                   ? RExC_parse - rangebegin
13366                                   : 0;
13367                     if (strict) {
13368                         vFAIL2utf8f(
13369                             "False [] range \"%"UTF8f"\"",
13370                             UTF8fARG(UTF, w, rangebegin));
13371                     }
13372                     else {
13373                         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
13374                         ckWARN2reg(RExC_parse,
13375                             "False [] range \"%"UTF8f"\"",
13376                             UTF8fARG(UTF, w, rangebegin));
13377                         (void)ReREFCNT_inc(RExC_rx_sv);
13378                         cp_list = add_cp_to_invlist(cp_list, '-');
13379                         cp_list = add_cp_to_invlist(cp_list, prevvalue);
13380                     }
13381                 }
13382
13383                 range = 0; /* this was not a true range */
13384                 element_count += 2; /* So counts for three values */
13385             }
13386
13387             classnum = namedclass_to_classnum(namedclass);
13388
13389             if (LOC && namedclass < ANYOF_POSIXL_MAX
13390 #ifndef HAS_ISASCII
13391                 && classnum != _CC_ASCII
13392 #endif
13393 #ifndef HAS_ISBLANK
13394                 && classnum != _CC_BLANK
13395 #endif
13396             ) {
13397                 if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL)
13398                     && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
13399                                                             ? -1
13400                                                             : 1)))
13401                 {
13402                     posixl_matches_all = TRUE;
13403                     break;
13404                 }
13405                 ANYOF_POSIXL_SET(ret, namedclass);
13406             }
13407             /* XXX After have made all the posix classes known at compile time
13408              * we can move the LOC handling below to above */
13409
13410             if (! SIZE_ONLY) {
13411                 if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
13412                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
13413
13414                         /* Here, should be \h, \H, \v, or \V.  Neither /d nor
13415                          * /l make a difference in what these match.  There
13416                          * would be problems if these characters had folds
13417                          * other than themselves, as cp_list is subject to
13418                          * folding. */
13419                         if (classnum != _CC_VERTSPACE) {
13420                             assert(   namedclass == ANYOF_HORIZWS
13421                                    || namedclass == ANYOF_NHORIZWS);
13422
13423                             /* It turns out that \h is just a synonym for
13424                              * XPosixBlank */
13425                             classnum = _CC_BLANK;
13426                         }
13427
13428                         _invlist_union_maybe_complement_2nd(
13429                                 cp_list,
13430                                 PL_XPosix_ptrs[classnum],
13431                                 cBOOL(namedclass % 2), /* Complement if odd
13432                                                           (NHORIZWS, NVERTWS)
13433                                                         */
13434                                 &cp_list);
13435                     }
13436                 }
13437                 else if (classnum == _CC_ASCII) {
13438 #ifdef HAS_ISASCII
13439                     if (LOC) {
13440                         ANYOF_POSIXL_SET(ret, namedclass);
13441                     }
13442                     else
13443 #endif  /* Not isascii(); just use the hard-coded definition for it */
13444                     {
13445                         _invlist_union_maybe_complement_2nd(
13446                                 posixes,
13447                                 PL_Posix_ptrs[_CC_ASCII],
13448                                 cBOOL(namedclass % 2), /* Complement if odd
13449                                                           (NASCII) */
13450                                 &posixes);
13451
13452                         /* The code points 128-255 added above will be
13453                          * subtracted out below under /d, so the flag needs to
13454                          * be set */
13455                         if (namedclass == ANYOF_NASCII && DEPENDS_SEMANTICS) {
13456                             ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
13457                         }
13458                     }
13459                 }
13460                 else {  /* Garden variety class */
13461
13462                     /* The ascii range inversion list */
13463                     SV* ascii_source = PL_Posix_ptrs[classnum];
13464
13465                     /* The full Latin1 range inversion list */
13466                     SV* l1_source = PL_L1Posix_ptrs[classnum];
13467
13468                     /* This code is structured into two major clauses.  The
13469                      * first is for classes whose complete definitions may not
13470                      * already be known.  If not, the Latin1 definition
13471                      * (guaranteed to already known) is used plus code is
13472                      * generated to load the rest at run-time (only if needed).
13473                      * If the complete definition is known, it drops down to
13474                      * the second clause, where the complete definition is
13475                      * known */
13476
13477                     if (classnum < _FIRST_NON_SWASH_CC) {
13478
13479                         /* Here, the class has a swash, which may or not
13480                          * already be loaded */
13481
13482                         /* The name of the property to use to match the full
13483                          * eXtended Unicode range swash for this character
13484                          * class */
13485                         const char *Xname = swash_property_names[classnum];
13486
13487                         /* If returning the inversion list, we can't defer
13488                          * getting this until runtime */
13489                         if (ret_invlist && !  PL_utf8_swash_ptrs[classnum]) {
13490                             PL_utf8_swash_ptrs[classnum] =
13491                                 _core_swash_init("utf8", Xname, &PL_sv_undef,
13492                                              1, /* binary */
13493                                              0, /* not tr/// */
13494                                              NULL, /* No inversion list */
13495                                              NULL  /* No flags */
13496                                             );
13497                             assert(PL_utf8_swash_ptrs[classnum]);
13498                         }
13499                         if ( !  PL_utf8_swash_ptrs[classnum]) {
13500                             if (namedclass % 2 == 0) { /* A non-complemented
13501                                                           class */
13502                                 /* If not /a matching, there are code points we
13503                                  * don't know at compile time.  Arrange for the
13504                                  * unknown matches to be loaded at run-time, if
13505                                  * needed */
13506                                 if (! AT_LEAST_ASCII_RESTRICTED) {
13507                                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::%s\n",
13508                                                                  Xname);
13509                                 }
13510                                 if (LOC) {  /* Under locale, set run-time
13511                                                lookup */
13512                                     ANYOF_POSIXL_SET(ret, namedclass);
13513                                 }
13514                                 else {
13515                                     /* Add the current class's code points to
13516                                      * the running total */
13517                                     _invlist_union(posixes,
13518                                                    (AT_LEAST_ASCII_RESTRICTED)
13519                                                         ? ascii_source
13520                                                         : l1_source,
13521                                                    &posixes);
13522                                 }
13523                             }
13524                             else {  /* A complemented class */
13525                                 if (AT_LEAST_ASCII_RESTRICTED) {
13526                                     /* Under /a should match everything above
13527                                      * ASCII, plus the complement of the set's
13528                                      * ASCII matches */
13529                                     _invlist_union_complement_2nd(posixes,
13530                                                                   ascii_source,
13531                                                                   &posixes);
13532                                 }
13533                                 else {
13534                                     /* Arrange for the unknown matches to be
13535                                      * loaded at run-time, if needed */
13536                                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::%s\n",
13537                                                                  Xname);
13538                                     runtime_posix_matches_above_Unicode = TRUE;
13539                                     if (LOC) {
13540                                         ANYOF_POSIXL_SET(ret, namedclass);
13541                                     }
13542                                     else {
13543
13544                                         /* We want to match everything in
13545                                          * Latin1, except those things that
13546                                          * l1_source matches */
13547                                         SV* scratch_list = NULL;
13548                                         _invlist_subtract(PL_Latin1, l1_source,
13549                                                           &scratch_list);
13550
13551                                         /* Add the list from this class to the
13552                                          * running total */
13553                                         if (! posixes) {
13554                                             posixes = scratch_list;
13555                                         }
13556                                         else {
13557                                             _invlist_union(posixes,
13558                                                            scratch_list,
13559                                                            &posixes);
13560                                             SvREFCNT_dec_NN(scratch_list);
13561                                         }
13562                                         if (DEPENDS_SEMANTICS) {
13563                                             ANYOF_FLAGS(ret)
13564                                                   |= ANYOF_NON_UTF8_LATIN1_ALL;
13565                                         }
13566                                     }
13567                                 }
13568                             }
13569                             goto namedclass_done;
13570                         }
13571
13572                         /* Here, there is a swash loaded for the class.  If no
13573                          * inversion list for it yet, get it */
13574                         if (! PL_XPosix_ptrs[classnum]) {
13575                             PL_XPosix_ptrs[classnum]
13576                              = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]);
13577                         }
13578                     }
13579
13580                     /* Here there is an inversion list already loaded for the
13581                      * entire class */
13582
13583                     if (namedclass % 2 == 0) {  /* A non-complemented class,
13584                                                    like ANYOF_PUNCT */
13585                         if (! LOC) {
13586                             /* For non-locale, just add it to any existing list
13587                              * */
13588                             _invlist_union(posixes,
13589                                            (AT_LEAST_ASCII_RESTRICTED)
13590                                                ? ascii_source
13591                                                : PL_XPosix_ptrs[classnum],
13592                                            &posixes);
13593                         }
13594                         else {  /* Locale */
13595                             SV* scratch_list = NULL;
13596
13597                             /* For above Latin1 code points, we use the full
13598                              * Unicode range */
13599                             _invlist_intersection(PL_AboveLatin1,
13600                                                   PL_XPosix_ptrs[classnum],
13601                                                   &scratch_list);
13602                             /* And set the output to it, adding instead if
13603                              * there already is an output.  Checking if
13604                              * 'posixes' is NULL first saves an extra clone.
13605                              * Its reference count will be decremented at the
13606                              * next union, etc, or if this is the only
13607                              * instance, at the end of the routine */
13608                             if (! posixes) {
13609                                 posixes = scratch_list;
13610                             }
13611                             else {
13612                                 _invlist_union(posixes, scratch_list, &posixes);
13613                                 SvREFCNT_dec_NN(scratch_list);
13614                             }
13615
13616 #ifndef HAS_ISBLANK
13617                             if (namedclass != ANYOF_BLANK) {
13618 #endif
13619                                 /* Set this class in the node for runtime
13620                                  * matching */
13621                                 ANYOF_POSIXL_SET(ret, namedclass);
13622 #ifndef HAS_ISBLANK
13623                             }
13624                             else {
13625                                 /* No isblank(), use the hard-coded ASCII-range
13626                                  * blanks, adding them to the running total. */
13627
13628                                 _invlist_union(posixes, ascii_source, &posixes);
13629                             }
13630 #endif
13631                         }
13632                     }
13633                     else {  /* A complemented class, like ANYOF_NPUNCT */
13634                         if (! LOC) {
13635                             _invlist_union_complement_2nd(
13636                                                 posixes,
13637                                                 (AT_LEAST_ASCII_RESTRICTED)
13638                                                     ? ascii_source
13639                                                     : PL_XPosix_ptrs[classnum],
13640                                                 &posixes);
13641                             /* Under /d, everything in the upper half of the
13642                              * Latin1 range matches this complement */
13643                             if (DEPENDS_SEMANTICS) {
13644                                 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
13645                             }
13646                         }
13647                         else {  /* Locale */
13648                             SV* scratch_list = NULL;
13649                             _invlist_subtract(PL_AboveLatin1,
13650                                               PL_XPosix_ptrs[classnum],
13651                                               &scratch_list);
13652                             if (! posixes) {
13653                                 posixes = scratch_list;
13654                             }
13655                             else {
13656                                 _invlist_union(posixes, scratch_list, &posixes);
13657                                 SvREFCNT_dec_NN(scratch_list);
13658                             }
13659 #ifndef HAS_ISBLANK
13660                             if (namedclass != ANYOF_NBLANK) {
13661 #endif
13662                                 ANYOF_POSIXL_SET(ret, namedclass);
13663 #ifndef HAS_ISBLANK
13664                             }
13665                             else {
13666                                 /* Get the list of all code points in Latin1
13667                                  * that are not ASCII blanks, and add them to
13668                                  * the running total */
13669                                 _invlist_subtract(PL_Latin1, ascii_source,
13670                                                   &scratch_list);
13671                                 _invlist_union(posixes, scratch_list, &posixes);
13672                                 SvREFCNT_dec_NN(scratch_list);
13673                             }
13674 #endif
13675                         }
13676                     }
13677                 }
13678               namedclass_done:
13679                 continue;   /* Go get next character */
13680             }
13681         } /* end of namedclass \blah */
13682
13683         /* Here, we have a single value.  If 'range' is set, it is the ending
13684          * of a range--check its validity.  Later, we will handle each
13685          * individual code point in the range.  If 'range' isn't set, this
13686          * could be the beginning of a range, so check for that by looking
13687          * ahead to see if the next real character to be processed is the range
13688          * indicator--the minus sign */
13689
13690         if (skip_white) {
13691             RExC_parse = regpatws(pRExC_state, RExC_parse,
13692                                 FALSE /* means don't recognize comments */);
13693         }
13694
13695         if (range) {
13696             if (prevvalue > value) /* b-a */ {
13697                 const int w = RExC_parse - rangebegin;
13698                 vFAIL2utf8f(
13699                     "Invalid [] range \"%"UTF8f"\"",
13700                     UTF8fARG(UTF, w, rangebegin));
13701                 range = 0; /* not a valid range */
13702             }
13703         }
13704         else {
13705             prevvalue = value; /* save the beginning of the potential range */
13706             if (! stop_at_1     /* Can't be a range if parsing just one thing */
13707                 && *RExC_parse == '-')
13708             {
13709                 char* next_char_ptr = RExC_parse + 1;
13710                 if (skip_white) {   /* Get the next real char after the '-' */
13711                     next_char_ptr = regpatws(pRExC_state,
13712                                              RExC_parse + 1,
13713                                              FALSE); /* means don't recognize
13714                                                         comments */
13715                 }
13716
13717                 /* If the '-' is at the end of the class (just before the ']',
13718                  * it is a literal minus; otherwise it is a range */
13719                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
13720                     RExC_parse = next_char_ptr;
13721
13722                     /* a bad range like \w-, [:word:]- ? */
13723                     if (namedclass > OOB_NAMEDCLASS) {
13724                         if (strict || ckWARN(WARN_REGEXP)) {
13725                             const int w =
13726                                 RExC_parse >= rangebegin ?
13727                                 RExC_parse - rangebegin : 0;
13728                             if (strict) {
13729                                 vFAIL4("False [] range \"%*.*s\"",
13730                                     w, w, rangebegin);
13731                             }
13732                             else {
13733                                 vWARN4(RExC_parse,
13734                                     "False [] range \"%*.*s\"",
13735                                     w, w, rangebegin);
13736                             }
13737                         }
13738                         if (!SIZE_ONLY) {
13739                             cp_list = add_cp_to_invlist(cp_list, '-');
13740                         }
13741                         element_count++;
13742                     } else
13743                         range = 1;      /* yeah, it's a range! */
13744                     continue;   /* but do it the next time */
13745                 }
13746             }
13747         }
13748
13749         /* Here, <prevvalue> is the beginning of the range, if any; or <value>
13750          * if not */
13751
13752         /* non-Latin1 code point implies unicode semantics.  Must be set in
13753          * pass1 so is there for the whole of pass 2 */
13754         if (value > 255) {
13755             RExC_uni_semantics = 1;
13756         }
13757
13758         /* Ready to process either the single value, or the completed range.
13759          * For single-valued non-inverted ranges, we consider the possibility
13760          * of multi-char folds.  (We made a conscious decision to not do this
13761          * for the other cases because it can often lead to non-intuitive
13762          * results.  For example, you have the peculiar case that:
13763          *  "s s" =~ /^[^\xDF]+$/i => Y
13764          *  "ss"  =~ /^[^\xDF]+$/i => N
13765          *
13766          * See [perl #89750] */
13767         if (FOLD && allow_multi_folds && value == prevvalue) {
13768             if (value == LATIN_SMALL_LETTER_SHARP_S
13769                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
13770                                                         value)))
13771             {
13772                 /* Here <value> is indeed a multi-char fold.  Get what it is */
13773
13774                 U8 foldbuf[UTF8_MAXBYTES_CASE];
13775                 STRLEN foldlen;
13776
13777                 UV folded = _to_uni_fold_flags(
13778                                 value,
13779                                 foldbuf,
13780                                 &foldlen,
13781                                 FOLD_FLAGS_FULL
13782                                 | ((LOC) ?  FOLD_FLAGS_LOCALE
13783                                             : (ASCII_FOLD_RESTRICTED)
13784                                               ? FOLD_FLAGS_NOMIX_ASCII
13785                                               : 0)
13786                                 );
13787
13788                 /* Here, <folded> should be the first character of the
13789                  * multi-char fold of <value>, with <foldbuf> containing the
13790                  * whole thing.  But, if this fold is not allowed (because of
13791                  * the flags), <fold> will be the same as <value>, and should
13792                  * be processed like any other character, so skip the special
13793                  * handling */
13794                 if (folded != value) {
13795
13796                     /* Skip if we are recursed, currently parsing the class
13797                      * again.  Otherwise add this character to the list of
13798                      * multi-char folds. */
13799                     if (! RExC_in_multi_char_class) {
13800                         AV** this_array_ptr;
13801                         AV* this_array;
13802                         STRLEN cp_count = utf8_length(foldbuf,
13803                                                       foldbuf + foldlen);
13804                         SV* multi_fold = sv_2mortal(newSVpvn("", 0));
13805
13806                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
13807
13808
13809                         if (! multi_char_matches) {
13810                             multi_char_matches = newAV();
13811                         }
13812
13813                         /* <multi_char_matches> is actually an array of arrays.
13814                          * There will be one or two top-level elements: [2],
13815                          * and/or [3].  The [2] element is an array, each
13816                          * element thereof is a character which folds to TWO
13817                          * characters; [3] is for folds to THREE characters.
13818                          * (Unicode guarantees a maximum of 3 characters in any
13819                          * fold.)  When we rewrite the character class below,
13820                          * we will do so such that the longest folds are
13821                          * written first, so that it prefers the longest
13822                          * matching strings first.  This is done even if it
13823                          * turns out that any quantifier is non-greedy, out of
13824                          * programmer laziness.  Tom Christiansen has agreed
13825                          * that this is ok.  This makes the test for the
13826                          * ligature 'ffi' come before the test for 'ff' */
13827                         if (av_exists(multi_char_matches, cp_count)) {
13828                             this_array_ptr = (AV**) av_fetch(multi_char_matches,
13829                                                              cp_count, FALSE);
13830                             this_array = *this_array_ptr;
13831                         }
13832                         else {
13833                             this_array = newAV();
13834                             av_store(multi_char_matches, cp_count,
13835                                      (SV*) this_array);
13836                         }
13837                         av_push(this_array, multi_fold);
13838                     }
13839
13840                     /* This element should not be processed further in this
13841                      * class */
13842                     element_count--;
13843                     value = save_value;
13844                     prevvalue = save_prevvalue;
13845                     continue;
13846                 }
13847             }
13848         }
13849
13850         /* Deal with this element of the class */
13851         if (! SIZE_ONLY) {
13852 #ifndef EBCDIC
13853             cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
13854 #else
13855             SV* this_range = _new_invlist(1);
13856             _append_range_to_invlist(this_range, prevvalue, value);
13857
13858             /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
13859              * If this range was specified using something like 'i-j', we want
13860              * to include only the 'i' and the 'j', and not anything in
13861              * between, so exclude non-ASCII, non-alphabetics from it.
13862              * However, if the range was specified with something like
13863              * [\x89-\x91] or [\x89-j], all code points within it should be
13864              * included.  literal_endpoint==2 means both ends of the range used
13865              * a literal character, not \x{foo} */
13866             if (literal_endpoint == 2
13867                 && ((prevvalue >= 'a' && value <= 'z')
13868                     || (prevvalue >= 'A' && value <= 'Z')))
13869             {
13870                 _invlist_intersection(this_range, PL_ASCII,
13871                                       &this_range);
13872                 _invlist_intersection(this_range, PL_Posix_ptrs[_CC_ALPHA],
13873                                       &this_range);
13874             }
13875             _invlist_union(cp_list, this_range, &cp_list);
13876             literal_endpoint = 0;
13877 #endif
13878         }
13879
13880         range = 0; /* this range (if it was one) is done now */
13881     } /* End of loop through all the text within the brackets */
13882
13883     /* If anything in the class expands to more than one character, we have to
13884      * deal with them by building up a substitute parse string, and recursively
13885      * calling reg() on it, instead of proceeding */
13886     if (multi_char_matches) {
13887         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
13888         I32 cp_count;
13889         STRLEN len;
13890         char *save_end = RExC_end;
13891         char *save_parse = RExC_parse;
13892         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
13893                                        a "|" */
13894         I32 reg_flags;
13895
13896         assert(! invert);
13897 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
13898            because too confusing */
13899         if (invert) {
13900             sv_catpv(substitute_parse, "(?:");
13901         }
13902 #endif
13903
13904         /* Look at the longest folds first */
13905         for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
13906
13907             if (av_exists(multi_char_matches, cp_count)) {
13908                 AV** this_array_ptr;
13909                 SV* this_sequence;
13910
13911                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
13912                                                  cp_count, FALSE);
13913                 while ((this_sequence = av_pop(*this_array_ptr)) !=
13914                                                                 &PL_sv_undef)
13915                 {
13916                     if (! first_time) {
13917                         sv_catpv(substitute_parse, "|");
13918                     }
13919                     first_time = FALSE;
13920
13921                     sv_catpv(substitute_parse, SvPVX(this_sequence));
13922                 }
13923             }
13924         }
13925
13926         /* If the character class contains anything else besides these
13927          * multi-character folds, have to include it in recursive parsing */
13928         if (element_count) {
13929             sv_catpv(substitute_parse, "|[");
13930             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
13931             sv_catpv(substitute_parse, "]");
13932         }
13933
13934         sv_catpv(substitute_parse, ")");
13935 #if 0
13936         if (invert) {
13937             /* This is a way to get the parse to skip forward a whole named
13938              * sequence instead of matching the 2nd character when it fails the
13939              * first */
13940             sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
13941         }
13942 #endif
13943
13944         RExC_parse = SvPV(substitute_parse, len);
13945         RExC_end = RExC_parse + len;
13946         RExC_in_multi_char_class = 1;
13947         RExC_emit = (regnode *)orig_emit;
13948
13949         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
13950
13951         *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
13952
13953         RExC_parse = save_parse;
13954         RExC_end = save_end;
13955         RExC_in_multi_char_class = 0;
13956         SvREFCNT_dec_NN(multi_char_matches);
13957         return ret;
13958     }
13959
13960     /* Here, we've gone through the entire class and dealt with multi-char
13961      * folds.  We are now in a position that we can do some checks to see if we
13962      * can optimize this ANYOF node into a simpler one, even in Pass 1.
13963      * Currently we only do two checks:
13964      * 1) is in the unlikely event that the user has specified both, eg. \w and
13965      *    \W under /l, then the class matches everything.  (This optimization
13966      *    is done only to make the optimizer code run later work.)
13967      * 2) if the character class contains only a single element (including a
13968      *    single range), we see if there is an equivalent node for it.
13969      * Other checks are possible */
13970     if (! ret_invlist   /* Can't optimize if returning the constructed
13971                            inversion list */
13972         && (UNLIKELY(posixl_matches_all) || element_count == 1))
13973     {
13974         U8 op = END;
13975         U8 arg = 0;
13976
13977         if (UNLIKELY(posixl_matches_all)) {
13978             op = SANY;
13979         }
13980         else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
13981                                                    \w or [:digit:] or \p{foo}
13982                                                  */
13983
13984             /* All named classes are mapped into POSIXish nodes, with its FLAG
13985              * argument giving which class it is */
13986             switch ((I32)namedclass) {
13987                 case ANYOF_UNIPROP:
13988                     break;
13989
13990                 /* These don't depend on the charset modifiers.  They always
13991                  * match under /u rules */
13992                 case ANYOF_NHORIZWS:
13993                 case ANYOF_HORIZWS:
13994                     namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
13995                     /* FALLTHROUGH */
13996
13997                 case ANYOF_NVERTWS:
13998                 case ANYOF_VERTWS:
13999                     op = POSIXU;
14000                     goto join_posix;
14001
14002                 /* The actual POSIXish node for all the rest depends on the
14003                  * charset modifier.  The ones in the first set depend only on
14004                  * ASCII or, if available on this platform, locale */
14005                 case ANYOF_ASCII:
14006                 case ANYOF_NASCII:
14007 #ifdef HAS_ISASCII
14008                     op = (LOC) ? POSIXL : POSIXA;
14009 #else
14010                     op = POSIXA;
14011 #endif
14012                     goto join_posix;
14013
14014                 case ANYOF_NCASED:
14015                 case ANYOF_LOWER:
14016                 case ANYOF_NLOWER:
14017                 case ANYOF_UPPER:
14018                 case ANYOF_NUPPER:
14019                     /* under /a could be alpha */
14020                     if (FOLD) {
14021                         if (ASCII_RESTRICTED) {
14022                             namedclass = ANYOF_ALPHA + (namedclass % 2);
14023                         }
14024                         else if (! LOC) {
14025                             break;
14026                         }
14027                     }
14028                     /* FALLTHROUGH */
14029
14030                 /* The rest have more possibilities depending on the charset.
14031                  * We take advantage of the enum ordering of the charset
14032                  * modifiers to get the exact node type, */
14033                 default:
14034                     op = POSIXD + get_regex_charset(RExC_flags);
14035                     if (op > POSIXA) { /* /aa is same as /a */
14036                         op = POSIXA;
14037                     }
14038 #ifndef HAS_ISBLANK
14039                     if (op == POSIXL
14040                         && (namedclass == ANYOF_BLANK
14041                             || namedclass == ANYOF_NBLANK))
14042                     {
14043                         op = POSIXA;
14044                     }
14045 #endif
14046
14047                 join_posix:
14048                     /* The odd numbered ones are the complements of the
14049                      * next-lower even number one */
14050                     if (namedclass % 2 == 1) {
14051                         invert = ! invert;
14052                         namedclass--;
14053                     }
14054                     arg = namedclass_to_classnum(namedclass);
14055                     break;
14056             }
14057         }
14058         else if (value == prevvalue) {
14059
14060             /* Here, the class consists of just a single code point */
14061
14062             if (invert) {
14063                 if (! LOC && value == '\n') {
14064                     op = REG_ANY; /* Optimize [^\n] */
14065                     *flagp |= HASWIDTH|SIMPLE;
14066                     RExC_naughty++;
14067                 }
14068             }
14069             else if (value < 256 || UTF) {
14070
14071                 /* Optimize a single value into an EXACTish node, but not if it
14072                  * would require converting the pattern to UTF-8. */
14073                 op = compute_EXACTish(pRExC_state);
14074             }
14075         } /* Otherwise is a range */
14076         else if (! LOC) {   /* locale could vary these */
14077             if (prevvalue == '0') {
14078                 if (value == '9') {
14079                     arg = _CC_DIGIT;
14080                     op = POSIXA;
14081                 }
14082             }
14083         }
14084
14085         /* Here, we have changed <op> away from its initial value iff we found
14086          * an optimization */
14087         if (op != END) {
14088
14089             /* Throw away this ANYOF regnode, and emit the calculated one,
14090              * which should correspond to the beginning, not current, state of
14091              * the parse */
14092             const char * cur_parse = RExC_parse;
14093             RExC_parse = (char *)orig_parse;
14094             if ( SIZE_ONLY) {
14095                 if (! LOC) {
14096
14097                     /* To get locale nodes to not use the full ANYOF size would
14098                      * require moving the code above that writes the portions
14099                      * of it that aren't in other nodes to after this point.
14100                      * e.g.  ANYOF_POSIXL_SET */
14101                     RExC_size = orig_size;
14102                 }
14103             }
14104             else {
14105                 RExC_emit = (regnode *)orig_emit;
14106                 if (PL_regkind[op] == POSIXD) {
14107                     if (invert) {
14108                         op += NPOSIXD - POSIXD;
14109                     }
14110                 }
14111             }
14112
14113             ret = reg_node(pRExC_state, op);
14114
14115             if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
14116                 if (! SIZE_ONLY) {
14117                     FLAGS(ret) = arg;
14118                 }
14119                 *flagp |= HASWIDTH|SIMPLE;
14120             }
14121             else if (PL_regkind[op] == EXACT) {
14122                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
14123             }
14124
14125             RExC_parse = (char *) cur_parse;
14126
14127             SvREFCNT_dec(posixes);
14128             SvREFCNT_dec(cp_list);
14129             return ret;
14130         }
14131     }
14132
14133     if (SIZE_ONLY)
14134         return ret;
14135     /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
14136
14137     /* If folding, we calculate all characters that could fold to or from the
14138      * ones already on the list */
14139     if (FOLD && cp_list) {
14140         UV start, end;  /* End points of code point ranges */
14141
14142         SV* fold_intersection = NULL;
14143
14144         /* If the highest code point is within Latin1, we can use the
14145          * compiled-in Alphas list, and not have to go out to disk.  This
14146          * yields two false positives, the masculine and feminine ordinal
14147          * indicators, which are weeded out below using the
14148          * IS_IN_SOME_FOLD_L1() macro */
14149         if (invlist_highest(cp_list) < 256) {
14150             _invlist_intersection(PL_L1Posix_ptrs[_CC_ALPHA], cp_list,
14151                                                            &fold_intersection);
14152         }
14153         else {
14154
14155             /* Here, there are non-Latin1 code points, so we will have to go
14156              * fetch the list of all the characters that participate in folds
14157              */
14158             if (! PL_utf8_foldable) {
14159                 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
14160                                        &PL_sv_undef, 1, 0);
14161                 PL_utf8_foldable = _get_swash_invlist(swash);
14162                 SvREFCNT_dec_NN(swash);
14163             }
14164
14165             /* This is a hash that for a particular fold gives all characters
14166              * that are involved in it */
14167             if (! PL_utf8_foldclosures) {
14168
14169                 /* If we were unable to find any folds, then we likely won't be
14170                  * able to find the closures.  So just create an empty list.
14171                  * Folding will effectively be restricted to the non-Unicode
14172                  * rules hard-coded into Perl.  (This case happens legitimately
14173                  * during compilation of Perl itself before the Unicode tables
14174                  * are generated) */
14175                 if (_invlist_len(PL_utf8_foldable) == 0) {
14176                     PL_utf8_foldclosures = newHV();
14177                 }
14178                 else {
14179                     /* If the folds haven't been read in, call a fold function
14180                      * to force that */
14181                     if (! PL_utf8_tofold) {
14182                         U8 dummy[UTF8_MAXBYTES_CASE+1];
14183
14184                         /* This string is just a short named one above \xff */
14185                         to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
14186                         assert(PL_utf8_tofold); /* Verify that worked */
14187                     }
14188                     PL_utf8_foldclosures =
14189                                     _swash_inversion_hash(PL_utf8_tofold);
14190                 }
14191             }
14192
14193             /* Only the characters in this class that participate in folds need
14194              * be checked.  Get the intersection of this class and all the
14195              * possible characters that are foldable.  This can quickly narrow
14196              * down a large class */
14197             _invlist_intersection(PL_utf8_foldable, cp_list,
14198                                   &fold_intersection);
14199         }
14200
14201         /* Now look at the foldable characters in this class individually */
14202         invlist_iterinit(fold_intersection);
14203         while (invlist_iternext(fold_intersection, &start, &end)) {
14204             UV j;
14205
14206             /* Locale folding for Latin1 characters is deferred until runtime */
14207             if (LOC && start < 256) {
14208                 start = 256;
14209             }
14210
14211             /* Look at every character in the range */
14212             for (j = start; j <= end; j++) {
14213
14214                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
14215                 STRLEN foldlen;
14216                 SV** listp;
14217
14218                 if (j < 256) {
14219
14220                     /* We have the latin1 folding rules hard-coded here so that
14221                      * an innocent-looking character class, like /[ks]/i won't
14222                      * have to go out to disk to find the possible matches.
14223                      * XXX It would be better to generate these via regen, in
14224                      * case a new version of the Unicode standard adds new
14225                      * mappings, though that is not really likely, and may be
14226                      * caught by the default: case of the switch below. */
14227
14228                     if (IS_IN_SOME_FOLD_L1(j)) {
14229
14230                         /* ASCII is always matched; non-ASCII is matched only
14231                          * under Unicode rules */
14232                         if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
14233                             cp_list =
14234                                 add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
14235                         }
14236                         else {
14237                             depends_list =
14238                              add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
14239                         }
14240                     }
14241
14242                     if (HAS_NONLATIN1_FOLD_CLOSURE(j)
14243                         && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
14244                     {
14245                         /* Certain Latin1 characters have matches outside
14246                          * Latin1.  To get here, <j> is one of those
14247                          * characters.   None of these matches is valid for
14248                          * ASCII characters under /aa, which is why the 'if'
14249                          * just above excludes those.  These matches only
14250                          * happen when the target string is utf8.  The code
14251                          * below adds the single fold closures for <j> to the
14252                          * inversion list. */
14253                         switch (j) {
14254                             case 'k':
14255                             case 'K':
14256                                 cp_list =
14257                                     add_cp_to_invlist(cp_list, KELVIN_SIGN);
14258                                 break;
14259                             case 's':
14260                             case 'S':
14261                                 cp_list = add_cp_to_invlist(cp_list,
14262                                                     LATIN_SMALL_LETTER_LONG_S);
14263                                 break;
14264                             case MICRO_SIGN:
14265                                 cp_list = add_cp_to_invlist(cp_list,
14266                                                     GREEK_CAPITAL_LETTER_MU);
14267                                 cp_list = add_cp_to_invlist(cp_list,
14268                                                     GREEK_SMALL_LETTER_MU);
14269                                 break;
14270                             case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
14271                             case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
14272                                 cp_list =
14273                                     add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
14274                                 break;
14275                             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
14276                                 cp_list = add_cp_to_invlist(cp_list,
14277                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
14278                                 break;
14279                             case LATIN_SMALL_LETTER_SHARP_S:
14280                                 cp_list = add_cp_to_invlist(cp_list,
14281                                                 LATIN_CAPITAL_LETTER_SHARP_S);
14282                                 break;
14283                             case 'F': case 'f':
14284                             case 'I': case 'i':
14285                             case 'L': case 'l':
14286                             case 'T': case 't':
14287                             case 'A': case 'a':
14288                             case 'H': case 'h':
14289                             case 'J': case 'j':
14290                             case 'N': case 'n':
14291                             case 'W': case 'w':
14292                             case 'Y': case 'y':
14293                                 /* These all are targets of multi-character
14294                                  * folds from code points that require UTF8 to
14295                                  * express, so they can't match unless the
14296                                  * target string is in UTF-8, so no action here
14297                                  * is necessary, as regexec.c properly handles
14298                                  * the general case for UTF-8 matching and
14299                                  * multi-char folds */
14300                                 break;
14301                             default:
14302                                 /* Use deprecated warning to increase the
14303                                  * chances of this being output */
14304                                 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
14305                                 break;
14306                         }
14307                     }
14308                     continue;
14309                 }
14310
14311                 /* Here is an above Latin1 character.  We don't have the rules
14312                  * hard-coded for it.  First, get its fold.  This is the simple
14313                  * fold, as the multi-character folds have been handled earlier
14314                  * and separated out */
14315                 _to_uni_fold_flags(j, foldbuf, &foldlen,
14316                                                ((LOC)
14317                                                ? FOLD_FLAGS_LOCALE
14318                                                : (ASCII_FOLD_RESTRICTED)
14319                                                   ? FOLD_FLAGS_NOMIX_ASCII
14320                                                   : 0));
14321
14322                 /* Single character fold of above Latin1.  Add everything in
14323                  * its fold closure to the list that this node should match.
14324                  * The fold closures data structure is a hash with the keys
14325                  * being the UTF-8 of every character that is folded to, like
14326                  * 'k', and the values each an array of all code points that
14327                  * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
14328                  * Multi-character folds are not included */
14329                 if ((listp = hv_fetch(PL_utf8_foldclosures,
14330                                       (char *) foldbuf, foldlen, FALSE)))
14331                 {
14332                     AV* list = (AV*) *listp;
14333                     IV k;
14334                     for (k = 0; k <= av_len(list); k++) {
14335                         SV** c_p = av_fetch(list, k, FALSE);
14336                         UV c;
14337                         if (c_p == NULL) {
14338                             Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
14339                         }
14340                         c = SvUV(*c_p);
14341
14342                         /* /aa doesn't allow folds between ASCII and non-; /l
14343                          * doesn't allow them between above and below 256 */
14344                         if ((ASCII_FOLD_RESTRICTED
14345                                   && (isASCII(c) != isASCII(j)))
14346                             || (LOC && c < 256)) {
14347                             continue;
14348                         }
14349
14350                         /* Folds involving non-ascii Latin1 characters
14351                          * under /d are added to a separate list */
14352                         if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
14353                         {
14354                             cp_list = add_cp_to_invlist(cp_list, c);
14355                         }
14356                         else {
14357                           depends_list = add_cp_to_invlist(depends_list, c);
14358                         }
14359                     }
14360                 }
14361             }
14362         }
14363         SvREFCNT_dec_NN(fold_intersection);
14364     }
14365
14366     /* And combine the result (if any) with any inversion list from posix
14367      * classes.  The lists are kept separate up to now because we don't want to
14368      * fold the classes (folding of those is automatically handled by the swash
14369      * fetching code) */
14370     if (posixes) {
14371         if (! DEPENDS_SEMANTICS) {
14372             if (cp_list) {
14373                 _invlist_union(cp_list, posixes, &cp_list);
14374                 SvREFCNT_dec_NN(posixes);
14375             }
14376             else {
14377                 cp_list = posixes;
14378             }
14379         }
14380         else {
14381             /* Under /d, we put into a separate list the Latin1 things that
14382              * match only when the target string is utf8 */
14383             SV* nonascii_but_latin1_properties = NULL;
14384             _invlist_intersection(posixes, PL_UpperLatin1,
14385                                   &nonascii_but_latin1_properties);
14386             _invlist_subtract(posixes, nonascii_but_latin1_properties,
14387                               &posixes);
14388             if (cp_list) {
14389                 _invlist_union(cp_list, posixes, &cp_list);
14390                 SvREFCNT_dec_NN(posixes);
14391             }
14392             else {
14393                 cp_list = posixes;
14394             }
14395
14396             if (depends_list) {
14397                 _invlist_union(depends_list, nonascii_but_latin1_properties,
14398                                &depends_list);
14399                 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
14400             }
14401             else {
14402                 depends_list = nonascii_but_latin1_properties;
14403             }
14404         }
14405     }
14406
14407     /* And combine the result (if any) with any inversion list from properties.
14408      * The lists are kept separate up to now so that we can distinguish the two
14409      * in regards to matching above-Unicode.  A run-time warning is generated
14410      * if a Unicode property is matched against a non-Unicode code point. But,
14411      * we allow user-defined properties to match anything, without any warning,
14412      * and we also suppress the warning if there is a portion of the character
14413      * class that isn't a Unicode property, and which matches above Unicode, \W
14414      * or [\x{110000}] for example.
14415      * (Note that in this case, unlike the Posix one above, there is no
14416      * <depends_list>, because having a Unicode property forces Unicode
14417      * semantics */
14418     if (properties) {
14419         if (cp_list) {
14420
14421             /* If it matters to the final outcome, see if a non-property
14422              * component of the class matches above Unicode.  If so, the
14423              * warning gets suppressed.  This is true even if just a single
14424              * such code point is specified, as though not strictly correct if
14425              * another such code point is matched against, the fact that they
14426              * are using above-Unicode code points indicates they should know
14427              * the issues involved */
14428             if (warn_super) {
14429                 bool non_prop_matches_above_Unicode =
14430                             runtime_posix_matches_above_Unicode
14431                             | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
14432                 if (invert) {
14433                     non_prop_matches_above_Unicode =
14434                                             !  non_prop_matches_above_Unicode;
14435                 }
14436                 warn_super = ! non_prop_matches_above_Unicode;
14437             }
14438
14439             _invlist_union(properties, cp_list, &cp_list);
14440             SvREFCNT_dec_NN(properties);
14441         }
14442         else {
14443             cp_list = properties;
14444         }
14445
14446         if (warn_super) {
14447             ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
14448         }
14449     }
14450
14451     /* Here, we have calculated what code points should be in the character
14452      * class.
14453      *
14454      * Now we can see about various optimizations.  Fold calculation (which we
14455      * did above) needs to take place before inversion.  Otherwise /[^k]/i
14456      * would invert to include K, which under /i would match k, which it
14457      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
14458      * folded until runtime */
14459
14460     /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
14461      * at compile time.  Besides not inverting folded locale now, we can't
14462      * invert if there are things such as \w, which aren't known until runtime
14463      * */
14464     if (invert
14465         && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_POSIXL)))
14466         && ! depends_list
14467         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14468     {
14469         _invlist_invert(cp_list);
14470
14471         /* Any swash can't be used as-is, because we've inverted things */
14472         if (swash) {
14473             SvREFCNT_dec_NN(swash);
14474             swash = NULL;
14475         }
14476
14477         /* Clear the invert flag since have just done it here */
14478         invert = FALSE;
14479     }
14480
14481     if (ret_invlist) {
14482         *ret_invlist = cp_list;
14483         SvREFCNT_dec(swash);
14484
14485         /* Discard the generated node */
14486         if (SIZE_ONLY) {
14487             RExC_size = orig_size;
14488         }
14489         else {
14490             RExC_emit = orig_emit;
14491         }
14492         return orig_emit;
14493     }
14494
14495     /* If we didn't do folding, it's because some information isn't available
14496      * until runtime; set the run-time fold flag for these.  (We don't have to
14497      * worry about properties folding, as that is taken care of by the swash
14498      * fetching) */
14499     if (FOLD && LOC)
14500     {
14501        ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14502     }
14503
14504     /* Some character classes are equivalent to other nodes.  Such nodes take
14505      * up less room and generally fewer operations to execute than ANYOF nodes.
14506      * Above, we checked for and optimized into some such equivalents for
14507      * certain common classes that are easy to test.  Getting to this point in
14508      * the code means that the class didn't get optimized there.  Since this
14509      * code is only executed in Pass 2, it is too late to save space--it has
14510      * been allocated in Pass 1, and currently isn't given back.  But turning
14511      * things into an EXACTish node can allow the optimizer to join it to any
14512      * adjacent such nodes.  And if the class is equivalent to things like /./,
14513      * expensive run-time swashes can be avoided.  Now that we have more
14514      * complete information, we can find things necessarily missed by the
14515      * earlier code.  I (khw) am not sure how much to look for here.  It would
14516      * be easy, but perhaps too slow, to check any candidates against all the
14517      * node types they could possibly match using _invlistEQ(). */
14518
14519     if (cp_list
14520         && ! invert
14521         && ! depends_list
14522         && ! (ANYOF_FLAGS(ret) & ANYOF_POSIXL)
14523         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14524
14525            /* We don't optimize if we are supposed to make sure all non-Unicode
14526             * code points raise a warning, as only ANYOF nodes have this check.
14527             * */
14528         && ! ((ANYOF_FLAGS(ret) | ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
14529     {
14530         UV start, end;
14531         U8 op = END;  /* The optimzation node-type */
14532         const char * cur_parse= RExC_parse;
14533
14534         invlist_iterinit(cp_list);
14535         if (! invlist_iternext(cp_list, &start, &end)) {
14536
14537             /* Here, the list is empty.  This happens, for example, when a
14538              * Unicode property is the only thing in the character class, and
14539              * it doesn't match anything.  (perluniprops.pod notes such
14540              * properties) */
14541             op = OPFAIL;
14542             *flagp |= HASWIDTH|SIMPLE;
14543         }
14544         else if (start == end) {    /* The range is a single code point */
14545             if (! invlist_iternext(cp_list, &start, &end)
14546
14547                     /* Don't do this optimization if it would require changing
14548                      * the pattern to UTF-8 */
14549                 && (start < 256 || UTF))
14550             {
14551                 /* Here, the list contains a single code point.  Can optimize
14552                  * into an EXACT node */
14553
14554                 value = start;
14555
14556                 if (! FOLD) {
14557                     op = EXACT;
14558                 }
14559                 else if (LOC) {
14560
14561                     /* A locale node under folding with one code point can be
14562                      * an EXACTFL, as its fold won't be calculated until
14563                      * runtime */
14564                     op = EXACTFL;
14565                 }
14566                 else {
14567
14568                     /* Here, we are generally folding, but there is only one
14569                      * code point to match.  If we have to, we use an EXACT
14570                      * node, but it would be better for joining with adjacent
14571                      * nodes in the optimization pass if we used the same
14572                      * EXACTFish node that any such are likely to be.  We can
14573                      * do this iff the code point doesn't participate in any
14574                      * folds.  For example, an EXACTF of a colon is the same as
14575                      * an EXACT one, since nothing folds to or from a colon. */
14576                     if (value < 256) {
14577                         if (IS_IN_SOME_FOLD_L1(value)) {
14578                             op = EXACT;
14579                         }
14580                     }
14581                     else {
14582                         if (! PL_utf8_foldable) {
14583                             SV* swash = swash_init("utf8", "_Perl_Any_Folds",
14584                                                 &PL_sv_undef, 1, 0);
14585                             PL_utf8_foldable = _get_swash_invlist(swash);
14586                             SvREFCNT_dec_NN(swash);
14587                         }
14588                         if (_invlist_contains_cp(PL_utf8_foldable, value)) {
14589                             op = EXACT;
14590                         }
14591                     }
14592
14593                     /* If we haven't found the node type, above, it means we
14594                      * can use the prevailing one */
14595                     if (op == END) {
14596                         op = compute_EXACTish(pRExC_state);
14597                     }
14598                 }
14599             }
14600         }
14601         else if (start == 0) {
14602             if (end == UV_MAX) {
14603                 op = SANY;
14604                 *flagp |= HASWIDTH|SIMPLE;
14605                 RExC_naughty++;
14606             }
14607             else if (end == '\n' - 1
14608                     && invlist_iternext(cp_list, &start, &end)
14609                     && start == '\n' + 1 && end == UV_MAX)
14610             {
14611                 op = REG_ANY;
14612                 *flagp |= HASWIDTH|SIMPLE;
14613                 RExC_naughty++;
14614             }
14615         }
14616         invlist_iterfinish(cp_list);
14617
14618         if (op != END) {
14619             RExC_parse = (char *)orig_parse;
14620             RExC_emit = (regnode *)orig_emit;
14621
14622             ret = reg_node(pRExC_state, op);
14623
14624             RExC_parse = (char *)cur_parse;
14625
14626             if (PL_regkind[op] == EXACT) {
14627                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
14628             }
14629
14630             SvREFCNT_dec_NN(cp_list);
14631             return ret;
14632         }
14633     }
14634
14635     /* Here, <cp_list> contains all the code points we can determine at
14636      * compile time that match under all conditions.  Go through it, and
14637      * for things that belong in the bitmap, put them there, and delete from
14638      * <cp_list>.  While we are at it, see if everything above 255 is in the
14639      * list, and if so, set a flag to speed up execution */
14640
14641     populate_ANYOF_from_invlist(ret, &cp_list);
14642
14643     if (invert) {
14644         ANYOF_FLAGS(ret) |= ANYOF_INVERT;
14645     }
14646
14647     /* Here, the bitmap has been populated with all the Latin1 code points that
14648      * always match.  Can now add to the overall list those that match only
14649      * when the target string is UTF-8 (<depends_list>). */
14650     if (depends_list) {
14651         if (cp_list) {
14652             _invlist_union(cp_list, depends_list, &cp_list);
14653             SvREFCNT_dec_NN(depends_list);
14654         }
14655         else {
14656             cp_list = depends_list;
14657         }
14658     }
14659
14660     /* If there is a swash and more than one element, we can't use the swash in
14661      * the optimization below. */
14662     if (swash && element_count > 1) {
14663         SvREFCNT_dec_NN(swash);
14664         swash = NULL;
14665     }
14666
14667     set_ANYOF_arg(pRExC_state, ret, cp_list,
14668                   (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14669                    ? listsv : NULL,
14670                   swash, has_user_defined_property);
14671
14672     *flagp |= HASWIDTH|SIMPLE;
14673     return ret;
14674 }
14675
14676 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14677
14678 STATIC void
14679 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
14680                 regnode* const node,
14681                 SV* const cp_list,
14682                 SV* const runtime_defns,
14683                 SV* const swash,
14684                 const bool has_user_defined_property)
14685 {
14686     /* Sets the arg field of an ANYOF-type node 'node', using information about
14687      * the node passed-in.  If there is nothing outside the node's bitmap, the
14688      * arg is set to ANYOF_NONBITMAP_EMPTY.  Otherwise, it sets the argument to
14689      * the count returned by add_data(), having allocated and stored an array,
14690      * av, that that count references, as follows:
14691      *  av[0] stores the character class description in its textual form.
14692      *        This is used later (regexec.c:Perl_regclass_swash()) to
14693      *        initialize the appropriate swash, and is also useful for dumping
14694      *        the regnode.  This is set to &PL_sv_undef if the textual
14695      *        description is not needed at run-time (as happens if the other
14696      *        elements completely define the class)
14697      *  av[1] if &PL_sv_undef, is a placeholder to later contain the swash
14698      *        computed from av[0].  But if no further computation need be done,
14699      *        the swash is stored here now (and av[0] is &PL_sv_undef).
14700      *  av[2] stores the cp_list inversion list for use in addition or instead
14701      *        of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
14702      *        (Otherwise everything needed is already in av[0] and av[1])
14703      *  av[3] is set if any component of the class is from a user-defined
14704      *        property; used only if av[2] exists */
14705
14706     UV n;
14707
14708     PERL_ARGS_ASSERT_SET_ANYOF_ARG;
14709
14710     if (! cp_list && ! runtime_defns) {
14711         ARG_SET(node, ANYOF_NONBITMAP_EMPTY);
14712     }
14713     else {
14714         AV * const av = newAV();
14715         SV *rv;
14716
14717         av_store(av, 0, (runtime_defns)
14718                         ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
14719         if (swash) {
14720             av_store(av, 1, swash);
14721             SvREFCNT_dec_NN(cp_list);
14722         }
14723         else {
14724             av_store(av, 1, &PL_sv_undef);
14725             if (cp_list) {
14726                 av_store(av, 2, cp_list);
14727                 av_store(av, 3, newSVuv(has_user_defined_property));
14728             }
14729         }
14730
14731         rv = newRV_noinc(MUTABLE_SV(av));
14732         n = add_data(pRExC_state, STR_WITH_LEN("s"));
14733         RExC_rxi->data->data[n] = (void*)rv;
14734         ARG_SET(node, n);
14735     }
14736 }
14737
14738
14739 /* reg_skipcomment()
14740
14741    Absorbs an /x style # comments from the input stream.
14742    Returns true if there is more text remaining in the stream.
14743    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
14744    terminates the pattern without including a newline.
14745
14746    Note its the callers responsibility to ensure that we are
14747    actually in /x mode
14748
14749 */
14750
14751 STATIC bool
14752 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
14753 {
14754     bool ended = 0;
14755
14756     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
14757
14758     while (RExC_parse < RExC_end)
14759         if (*RExC_parse++ == '\n') {
14760             ended = 1;
14761             break;
14762         }
14763     if (!ended) {
14764         /* we ran off the end of the pattern without ending
14765            the comment, so we have to add an \n when wrapping */
14766         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
14767         return 0;
14768     } else
14769         return 1;
14770 }
14771
14772 /* nextchar()
14773
14774    Advances the parse position, and optionally absorbs
14775    "whitespace" from the inputstream.
14776
14777    Without /x "whitespace" means (?#...) style comments only,
14778    with /x this means (?#...) and # comments and whitespace proper.
14779
14780    Returns the RExC_parse point from BEFORE the scan occurs.
14781
14782    This is the /x friendly way of saying RExC_parse++.
14783 */
14784
14785 STATIC char*
14786 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
14787 {
14788     char* const retval = RExC_parse++;
14789
14790     PERL_ARGS_ASSERT_NEXTCHAR;
14791
14792     for (;;) {
14793         if (RExC_end - RExC_parse >= 3
14794             && *RExC_parse == '('
14795             && RExC_parse[1] == '?'
14796             && RExC_parse[2] == '#')
14797         {
14798             while (*RExC_parse != ')') {
14799                 if (RExC_parse == RExC_end)
14800                     FAIL("Sequence (?#... not terminated");
14801                 RExC_parse++;
14802             }
14803             RExC_parse++;
14804             continue;
14805         }
14806         if (RExC_flags & RXf_PMf_EXTENDED) {
14807             if (isSPACE(*RExC_parse)) {
14808                 RExC_parse++;
14809                 continue;
14810             }
14811             else if (*RExC_parse == '#') {
14812                 if ( reg_skipcomment( pRExC_state ) )
14813                     continue;
14814             }
14815         }
14816         return retval;
14817     }
14818 }
14819
14820 /*
14821 - reg_node - emit a node
14822 */
14823 STATIC regnode *                        /* Location. */
14824 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
14825 {
14826     dVAR;
14827     regnode *ptr;
14828     regnode * const ret = RExC_emit;
14829     GET_RE_DEBUG_FLAGS_DECL;
14830
14831     PERL_ARGS_ASSERT_REG_NODE;
14832
14833     if (SIZE_ONLY) {
14834         SIZE_ALIGN(RExC_size);
14835         RExC_size += 1;
14836         return(ret);
14837     }
14838     if (RExC_emit >= RExC_emit_bound)
14839         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
14840                    op, RExC_emit, RExC_emit_bound);
14841
14842     NODE_ALIGN_FILL(ret);
14843     ptr = ret;
14844     FILL_ADVANCE_NODE(ptr, op);
14845 #ifdef RE_TRACK_PATTERN_OFFSETS
14846     if (RExC_offsets) {         /* MJD */
14847         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
14848               "reg_node", __LINE__,
14849               PL_reg_name[op],
14850               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
14851                 ? "Overwriting end of array!\n" : "OK",
14852               (UV)(RExC_emit - RExC_emit_start),
14853               (UV)(RExC_parse - RExC_start),
14854               (UV)RExC_offsets[0]));
14855         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
14856     }
14857 #endif
14858     RExC_emit = ptr;
14859     return(ret);
14860 }
14861
14862 /*
14863 - reganode - emit a node with an argument
14864 */
14865 STATIC regnode *                        /* Location. */
14866 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
14867 {
14868     dVAR;
14869     regnode *ptr;
14870     regnode * const ret = RExC_emit;
14871     GET_RE_DEBUG_FLAGS_DECL;
14872
14873     PERL_ARGS_ASSERT_REGANODE;
14874
14875     if (SIZE_ONLY) {
14876         SIZE_ALIGN(RExC_size);
14877         RExC_size += 2;
14878         /*
14879            We can't do this:
14880
14881            assert(2==regarglen[op]+1);
14882
14883            Anything larger than this has to allocate the extra amount.
14884            If we changed this to be:
14885
14886            RExC_size += (1 + regarglen[op]);
14887
14888            then it wouldn't matter. Its not clear what side effect
14889            might come from that so its not done so far.
14890            -- dmq
14891         */
14892         return(ret);
14893     }
14894     if (RExC_emit >= RExC_emit_bound)
14895         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
14896                    op, RExC_emit, RExC_emit_bound);
14897
14898     NODE_ALIGN_FILL(ret);
14899     ptr = ret;
14900     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
14901 #ifdef RE_TRACK_PATTERN_OFFSETS
14902     if (RExC_offsets) {         /* MJD */
14903         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14904               "reganode",
14905               __LINE__,
14906               PL_reg_name[op],
14907               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
14908               "Overwriting end of array!\n" : "OK",
14909               (UV)(RExC_emit - RExC_emit_start),
14910               (UV)(RExC_parse - RExC_start),
14911               (UV)RExC_offsets[0]));
14912         Set_Cur_Node_Offset;
14913     }
14914 #endif
14915     RExC_emit = ptr;
14916     return(ret);
14917 }
14918
14919 /*
14920 - reguni - emit (if appropriate) a Unicode character
14921 */
14922 STATIC STRLEN
14923 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
14924 {
14925     dVAR;
14926
14927     PERL_ARGS_ASSERT_REGUNI;
14928
14929     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
14930 }
14931
14932 /*
14933 - reginsert - insert an operator in front of already-emitted operand
14934 *
14935 * Means relocating the operand.
14936 */
14937 STATIC void
14938 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
14939 {
14940     dVAR;
14941     regnode *src;
14942     regnode *dst;
14943     regnode *place;
14944     const int offset = regarglen[(U8)op];
14945     const int size = NODE_STEP_REGNODE + offset;
14946     GET_RE_DEBUG_FLAGS_DECL;
14947
14948     PERL_ARGS_ASSERT_REGINSERT;
14949     PERL_UNUSED_ARG(depth);
14950 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
14951     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
14952     if (SIZE_ONLY) {
14953         RExC_size += size;
14954         return;
14955     }
14956
14957     src = RExC_emit;
14958     RExC_emit += size;
14959     dst = RExC_emit;
14960     if (RExC_open_parens) {
14961         int paren;
14962         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
14963         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
14964             if ( RExC_open_parens[paren] >= opnd ) {
14965                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
14966                 RExC_open_parens[paren] += size;
14967             } else {
14968                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
14969             }
14970             if ( RExC_close_parens[paren] >= opnd ) {
14971                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
14972                 RExC_close_parens[paren] += size;
14973             } else {
14974                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
14975             }
14976         }
14977     }
14978
14979     while (src > opnd) {
14980         StructCopy(--src, --dst, regnode);
14981 #ifdef RE_TRACK_PATTERN_OFFSETS
14982         if (RExC_offsets) {     /* MJD 20010112 */
14983             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
14984                   "reg_insert",
14985                   __LINE__,
14986                   PL_reg_name[op],
14987                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
14988                     ? "Overwriting end of array!\n" : "OK",
14989                   (UV)(src - RExC_emit_start),
14990                   (UV)(dst - RExC_emit_start),
14991                   (UV)RExC_offsets[0]));
14992             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
14993             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
14994         }
14995 #endif
14996     }
14997
14998
14999     place = opnd;               /* Op node, where operand used to be. */
15000 #ifdef RE_TRACK_PATTERN_OFFSETS
15001     if (RExC_offsets) {         /* MJD */
15002         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15003               "reginsert",
15004               __LINE__,
15005               PL_reg_name[op],
15006               (UV)(place - RExC_emit_start) > RExC_offsets[0]
15007               ? "Overwriting end of array!\n" : "OK",
15008               (UV)(place - RExC_emit_start),
15009               (UV)(RExC_parse - RExC_start),
15010               (UV)RExC_offsets[0]));
15011         Set_Node_Offset(place, RExC_parse);
15012         Set_Node_Length(place, 1);
15013     }
15014 #endif
15015     src = NEXTOPER(place);
15016     FILL_ADVANCE_NODE(place, op);
15017     Zero(src, offset, regnode);
15018 }
15019
15020 /*
15021 - regtail - set the next-pointer at the end of a node chain of p to val.
15022 - SEE ALSO: regtail_study
15023 */
15024 /* TODO: All three parms should be const */
15025 STATIC void
15026 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
15027 {
15028     dVAR;
15029     regnode *scan;
15030     GET_RE_DEBUG_FLAGS_DECL;
15031
15032     PERL_ARGS_ASSERT_REGTAIL;
15033 #ifndef DEBUGGING
15034     PERL_UNUSED_ARG(depth);
15035 #endif
15036
15037     if (SIZE_ONLY)
15038         return;
15039
15040     /* Find last node. */
15041     scan = p;
15042     for (;;) {
15043         regnode * const temp = regnext(scan);
15044         DEBUG_PARSE_r({
15045             SV * const mysv=sv_newmortal();
15046             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
15047             regprop(RExC_rx, mysv, scan);
15048             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
15049                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
15050                     (temp == NULL ? "->" : ""),
15051                     (temp == NULL ? PL_reg_name[OP(val)] : "")
15052             );
15053         });
15054         if (temp == NULL)
15055             break;
15056         scan = temp;
15057     }
15058
15059     if (reg_off_by_arg[OP(scan)]) {
15060         ARG_SET(scan, val - scan);
15061     }
15062     else {
15063         NEXT_OFF(scan) = val - scan;
15064     }
15065 }
15066
15067 #ifdef DEBUGGING
15068 /*
15069 - regtail_study - set the next-pointer at the end of a node chain of p to val.
15070 - Look for optimizable sequences at the same time.
15071 - currently only looks for EXACT chains.
15072
15073 This is experimental code. The idea is to use this routine to perform
15074 in place optimizations on branches and groups as they are constructed,
15075 with the long term intention of removing optimization from study_chunk so
15076 that it is purely analytical.
15077
15078 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
15079 to control which is which.
15080
15081 */
15082 /* TODO: All four parms should be const */
15083
15084 STATIC U8
15085 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
15086 {
15087     dVAR;
15088     regnode *scan;
15089     U8 exact = PSEUDO;
15090 #ifdef EXPERIMENTAL_INPLACESCAN
15091     I32 min = 0;
15092 #endif
15093     GET_RE_DEBUG_FLAGS_DECL;
15094
15095     PERL_ARGS_ASSERT_REGTAIL_STUDY;
15096
15097
15098     if (SIZE_ONLY)
15099         return exact;
15100
15101     /* Find last node. */
15102
15103     scan = p;
15104     for (;;) {
15105         regnode * const temp = regnext(scan);
15106 #ifdef EXPERIMENTAL_INPLACESCAN
15107         if (PL_regkind[OP(scan)] == EXACT) {
15108             bool has_exactf_sharp_s;    /* Unexamined in this routine */
15109             if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
15110                 return EXACT;
15111         }
15112 #endif
15113         if ( exact ) {
15114             switch (OP(scan)) {
15115                 case EXACT:
15116                 case EXACTF:
15117                 case EXACTFA_NO_TRIE:
15118                 case EXACTFA:
15119                 case EXACTFU:
15120                 case EXACTFU_SS:
15121                 case EXACTFL:
15122                         if( exact == PSEUDO )
15123                             exact= OP(scan);
15124                         else if ( exact != OP(scan) )
15125                             exact= 0;
15126                 case NOTHING:
15127                     break;
15128                 default:
15129                     exact= 0;
15130             }
15131         }
15132         DEBUG_PARSE_r({
15133             SV * const mysv=sv_newmortal();
15134             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
15135             regprop(RExC_rx, mysv, scan);
15136             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
15137                 SvPV_nolen_const(mysv),
15138                 REG_NODE_NUM(scan),
15139                 PL_reg_name[exact]);
15140         });
15141         if (temp == NULL)
15142             break;
15143         scan = temp;
15144     }
15145     DEBUG_PARSE_r({
15146         SV * const mysv_val=sv_newmortal();
15147         DEBUG_PARSE_MSG("");
15148         regprop(RExC_rx, mysv_val, val);
15149         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
15150                       SvPV_nolen_const(mysv_val),
15151                       (IV)REG_NODE_NUM(val),
15152                       (IV)(val - scan)
15153         );
15154     });
15155     if (reg_off_by_arg[OP(scan)]) {
15156         ARG_SET(scan, val - scan);
15157     }
15158     else {
15159         NEXT_OFF(scan) = val - scan;
15160     }
15161
15162     return exact;
15163 }
15164 #endif
15165
15166 /*
15167  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
15168  */
15169 #ifdef DEBUGGING
15170
15171 static void
15172 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
15173 {
15174     int bit;
15175     int set=0;
15176
15177     for (bit=0; bit<32; bit++) {
15178         if (flags & (1<<bit)) {
15179             if (!set++ && lead)
15180                 PerlIO_printf(Perl_debug_log, "%s",lead);
15181             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
15182         }
15183     }
15184     if (lead)  {
15185         if (set)
15186             PerlIO_printf(Perl_debug_log, "\n");
15187         else
15188             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15189     }
15190 }
15191
15192 static void
15193 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
15194 {
15195     int bit;
15196     int set=0;
15197     regex_charset cs;
15198
15199     for (bit=0; bit<32; bit++) {
15200         if (flags & (1<<bit)) {
15201             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
15202                 continue;
15203             }
15204             if (!set++ && lead)
15205                 PerlIO_printf(Perl_debug_log, "%s",lead);
15206             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
15207         }
15208     }
15209     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
15210             if (!set++ && lead) {
15211                 PerlIO_printf(Perl_debug_log, "%s",lead);
15212             }
15213             switch (cs) {
15214                 case REGEX_UNICODE_CHARSET:
15215                     PerlIO_printf(Perl_debug_log, "UNICODE");
15216                     break;
15217                 case REGEX_LOCALE_CHARSET:
15218                     PerlIO_printf(Perl_debug_log, "LOCALE");
15219                     break;
15220                 case REGEX_ASCII_RESTRICTED_CHARSET:
15221                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
15222                     break;
15223                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
15224                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
15225                     break;
15226                 default:
15227                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
15228                     break;
15229             }
15230     }
15231     if (lead)  {
15232         if (set)
15233             PerlIO_printf(Perl_debug_log, "\n");
15234         else
15235             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15236     }
15237 }
15238 #endif
15239
15240 void
15241 Perl_regdump(pTHX_ const regexp *r)
15242 {
15243 #ifdef DEBUGGING
15244     dVAR;
15245     SV * const sv = sv_newmortal();
15246     SV *dsv= sv_newmortal();
15247     RXi_GET_DECL(r,ri);
15248     GET_RE_DEBUG_FLAGS_DECL;
15249
15250     PERL_ARGS_ASSERT_REGDUMP;
15251
15252     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
15253
15254     /* Header fields of interest. */
15255     if (r->anchored_substr) {
15256         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
15257             RE_SV_DUMPLEN(r->anchored_substr), 30);
15258         PerlIO_printf(Perl_debug_log,
15259                       "anchored %s%s at %"IVdf" ",
15260                       s, RE_SV_TAIL(r->anchored_substr),
15261                       (IV)r->anchored_offset);
15262     } else if (r->anchored_utf8) {
15263         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
15264             RE_SV_DUMPLEN(r->anchored_utf8), 30);
15265         PerlIO_printf(Perl_debug_log,
15266                       "anchored utf8 %s%s at %"IVdf" ",
15267                       s, RE_SV_TAIL(r->anchored_utf8),
15268                       (IV)r->anchored_offset);
15269     }
15270     if (r->float_substr) {
15271         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
15272             RE_SV_DUMPLEN(r->float_substr), 30);
15273         PerlIO_printf(Perl_debug_log,
15274                       "floating %s%s at %"IVdf"..%"UVuf" ",
15275                       s, RE_SV_TAIL(r->float_substr),
15276                       (IV)r->float_min_offset, (UV)r->float_max_offset);
15277     } else if (r->float_utf8) {
15278         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
15279             RE_SV_DUMPLEN(r->float_utf8), 30);
15280         PerlIO_printf(Perl_debug_log,
15281                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
15282                       s, RE_SV_TAIL(r->float_utf8),
15283                       (IV)r->float_min_offset, (UV)r->float_max_offset);
15284     }
15285     if (r->check_substr || r->check_utf8)
15286         PerlIO_printf(Perl_debug_log,
15287                       (const char *)
15288                       (r->check_substr == r->float_substr
15289                        && r->check_utf8 == r->float_utf8
15290                        ? "(checking floating" : "(checking anchored"));
15291     if (r->extflags & RXf_NOSCAN)
15292         PerlIO_printf(Perl_debug_log, " noscan");
15293     if (r->extflags & RXf_CHECK_ALL)
15294         PerlIO_printf(Perl_debug_log, " isall");
15295     if (r->check_substr || r->check_utf8)
15296         PerlIO_printf(Perl_debug_log, ") ");
15297
15298     if (ri->regstclass) {
15299         regprop(r, sv, ri->regstclass);
15300         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
15301     }
15302     if (r->extflags & RXf_ANCH) {
15303         PerlIO_printf(Perl_debug_log, "anchored");
15304         if (r->extflags & RXf_ANCH_BOL)
15305             PerlIO_printf(Perl_debug_log, "(BOL)");
15306         if (r->extflags & RXf_ANCH_MBOL)
15307             PerlIO_printf(Perl_debug_log, "(MBOL)");
15308         if (r->extflags & RXf_ANCH_SBOL)
15309             PerlIO_printf(Perl_debug_log, "(SBOL)");
15310         if (r->extflags & RXf_ANCH_GPOS)
15311             PerlIO_printf(Perl_debug_log, "(GPOS)");
15312         PerlIO_putc(Perl_debug_log, ' ');
15313     }
15314     if (r->extflags & RXf_GPOS_SEEN)
15315         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
15316     if (r->intflags & PREGf_SKIP)
15317         PerlIO_printf(Perl_debug_log, "plus ");
15318     if (r->intflags & PREGf_IMPLICIT)
15319         PerlIO_printf(Perl_debug_log, "implicit ");
15320     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
15321     if (r->extflags & RXf_EVAL_SEEN)
15322         PerlIO_printf(Perl_debug_log, "with eval ");
15323     PerlIO_printf(Perl_debug_log, "\n");
15324     DEBUG_FLAGS_r({
15325         regdump_extflags("r->extflags: ",r->extflags);
15326         regdump_intflags("r->intflags: ",r->intflags);
15327     });
15328 #else
15329     PERL_ARGS_ASSERT_REGDUMP;
15330     PERL_UNUSED_CONTEXT;
15331     PERL_UNUSED_ARG(r);
15332 #endif  /* DEBUGGING */
15333 }
15334
15335 /*
15336 - regprop - printable representation of opcode
15337 */
15338
15339 void
15340 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
15341 {
15342 #ifdef DEBUGGING
15343     dVAR;
15344     int k;
15345
15346     /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
15347     static const char * const anyofs[] = {
15348 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
15349     || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
15350     || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
15351     || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
15352     || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
15353     || _CC_VERTSPACE != 16
15354   #error Need to adjust order of anyofs[]
15355 #endif
15356         "\\w",
15357         "\\W",
15358         "\\d",
15359         "\\D",
15360         "[:alpha:]",
15361         "[:^alpha:]",
15362         "[:lower:]",
15363         "[:^lower:]",
15364         "[:upper:]",
15365         "[:^upper:]",
15366         "[:punct:]",
15367         "[:^punct:]",
15368         "[:print:]",
15369         "[:^print:]",
15370         "[:alnum:]",
15371         "[:^alnum:]",
15372         "[:graph:]",
15373         "[:^graph:]",
15374         "[:cased:]",
15375         "[:^cased:]",
15376         "\\s",
15377         "\\S",
15378         "[:blank:]",
15379         "[:^blank:]",
15380         "[:xdigit:]",
15381         "[:^xdigit:]",
15382         "[:space:]",
15383         "[:^space:]",
15384         "[:cntrl:]",
15385         "[:^cntrl:]",
15386         "[:ascii:]",
15387         "[:^ascii:]",
15388         "\\v",
15389         "\\V"
15390     };
15391     RXi_GET_DECL(prog,progi);
15392     GET_RE_DEBUG_FLAGS_DECL;
15393
15394     PERL_ARGS_ASSERT_REGPROP;
15395
15396     sv_setpvs(sv, "");
15397
15398     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
15399         /* It would be nice to FAIL() here, but this may be called from
15400            regexec.c, and it would be hard to supply pRExC_state. */
15401         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
15402     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
15403
15404     k = PL_regkind[OP(o)];
15405
15406     if (k == EXACT) {
15407         sv_catpvs(sv, " ");
15408         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
15409          * is a crude hack but it may be the best for now since
15410          * we have no flag "this EXACTish node was UTF-8"
15411          * --jhi */
15412         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
15413                   PERL_PV_ESCAPE_UNI_DETECT |
15414                   PERL_PV_ESCAPE_NONASCII   |
15415                   PERL_PV_PRETTY_ELLIPSES   |
15416                   PERL_PV_PRETTY_LTGT       |
15417                   PERL_PV_PRETTY_NOCLEAR
15418                   );
15419     } else if (k == TRIE) {
15420         /* print the details of the trie in dumpuntil instead, as
15421          * progi->data isn't available here */
15422         const char op = OP(o);
15423         const U32 n = ARG(o);
15424         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
15425                (reg_ac_data *)progi->data->data[n] :
15426                NULL;
15427         const reg_trie_data * const trie
15428             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
15429
15430         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
15431         DEBUG_TRIE_COMPILE_r(
15432             Perl_sv_catpvf(aTHX_ sv,
15433                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
15434                 (UV)trie->startstate,
15435                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
15436                 (UV)trie->wordcount,
15437                 (UV)trie->minlen,
15438                 (UV)trie->maxlen,
15439                 (UV)TRIE_CHARCOUNT(trie),
15440                 (UV)trie->uniquecharcount
15441             )
15442         );
15443         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
15444             sv_catpvs(sv, "[");
15445             (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
15446                                                    ? ANYOF_BITMAP(o)
15447                                                    : TRIE_BITMAP(trie));
15448             sv_catpvs(sv, "]");
15449         }
15450
15451     } else if (k == CURLY) {
15452         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
15453             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
15454         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
15455     }
15456     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
15457         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
15458     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
15459         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
15460         if ( RXp_PAREN_NAMES(prog) ) {
15461             if ( k != REF || (OP(o) < NREF)) {
15462                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
15463                 SV **name= av_fetch(list, ARG(o), 0 );
15464                 if (name)
15465                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15466             }
15467             else {
15468                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
15469                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
15470                 I32 *nums=(I32*)SvPVX(sv_dat);
15471                 SV **name= av_fetch(list, nums[0], 0 );
15472                 I32 n;
15473                 if (name) {
15474                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
15475                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
15476                                     (n ? "," : ""), (IV)nums[n]);
15477                     }
15478                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15479                 }
15480             }
15481         }
15482     } else if (k == GOSUB)
15483         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
15484     else if (k == VERB) {
15485         if (!o->flags)
15486             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
15487                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
15488     } else if (k == LOGICAL)
15489         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
15490     else if (k == ANYOF) {
15491         const U8 flags = ANYOF_FLAGS(o);
15492         int do_sep = 0;
15493
15494
15495         if (flags & ANYOF_LOCALE)
15496             sv_catpvs(sv, "{loc}");
15497         if (flags & ANYOF_LOC_FOLD)
15498             sv_catpvs(sv, "{i}");
15499         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
15500         if (flags & ANYOF_INVERT)
15501             sv_catpvs(sv, "^");
15502
15503         /* output what the standard cp 0-255 bitmap matches */
15504         do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
15505
15506         /* output any special charclass tests (used entirely under use
15507          * locale) * */
15508         if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
15509             int i;
15510             for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
15511                 if (ANYOF_POSIXL_TEST(o,i)) {
15512                     sv_catpv(sv, anyofs[i]);
15513                     do_sep = 1;
15514                 }
15515             }
15516         }
15517
15518         if (flags & (ANYOF_ABOVE_LATIN1_ALL|ANYOF_ABOVE_LATIN1_ALL)
15519             || ANYOF_NONBITMAP(o))
15520         {
15521             if (do_sep) {
15522                 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
15523                 if (flags & ANYOF_INVERT)
15524                     /*make sure the invert info is in each */
15525                     sv_catpvs(sv, "^");
15526             }
15527
15528         if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
15529             sv_catpvs(sv, "{non-utf8-latin1-all}");
15530         }
15531
15532         /* output information about the unicode matching */
15533         if (flags & ANYOF_ABOVE_LATIN1_ALL)
15534             sv_catpvs(sv, "{unicode_all}");
15535         else if (ANYOF_NONBITMAP(o)) {
15536             SV *lv; /* Set if there is something outside the bit map. */
15537             bool byte_output = FALSE;   /* If something in the bitmap has been
15538                                            output */
15539
15540             if (flags & ANYOF_NONBITMAP_NON_UTF8) {
15541                 sv_catpvs(sv, "{outside bitmap}");
15542             }
15543             else {
15544                 sv_catpvs(sv, "{utf8}");
15545             }
15546
15547             /* Get the stuff that wasn't in the bitmap */
15548             (void) regclass_swash(prog, o, FALSE, &lv, NULL);
15549             if (lv && lv != &PL_sv_undef) {
15550                 char *s = savesvpv(lv);
15551                 char * const origs = s;
15552
15553                 while (*s && *s != '\n')
15554                     s++;
15555
15556                 if (*s == '\n') {
15557                     const char * const t = ++s;
15558
15559                     if (byte_output) {
15560                         sv_catpvs(sv, " ");
15561                     }
15562
15563                     while (*s) {
15564                         if (*s == '\n') {
15565
15566                             /* Truncate very long output */
15567                             if (s - origs > 256) {
15568                                 Perl_sv_catpvf(aTHX_ sv,
15569                                                "%.*s...",
15570                                                (int) (s - origs - 1),
15571                                                t);
15572                                 goto out_dump;
15573                             }
15574                             *s = ' ';
15575                         }
15576                         else if (*s == '\t') {
15577                             *s = '-';
15578                         }
15579                         s++;
15580                     }
15581                     if (s[-1] == ' ')
15582                         s[-1] = 0;
15583
15584                     sv_catpv(sv, t);
15585                 }
15586
15587             out_dump:
15588
15589                 Safefree(origs);
15590                 SvREFCNT_dec_NN(lv);
15591             }
15592         }
15593         }
15594
15595         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
15596     }
15597     else if (k == POSIXD || k == NPOSIXD) {
15598         U8 index = FLAGS(o) * 2;
15599         if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
15600             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
15601         }
15602         else {
15603             if (*anyofs[index] != '[')  {
15604                 sv_catpv(sv, "[");
15605             }
15606             sv_catpv(sv, anyofs[index]);
15607             if (*anyofs[index] != '[')  {
15608                 sv_catpv(sv, "]");
15609             }
15610         }
15611     }
15612     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
15613         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
15614 #else
15615     PERL_UNUSED_CONTEXT;
15616     PERL_UNUSED_ARG(sv);
15617     PERL_UNUSED_ARG(o);
15618     PERL_UNUSED_ARG(prog);
15619 #endif  /* DEBUGGING */
15620 }
15621
15622 SV *
15623 Perl_re_intuit_string(pTHX_ REGEXP * const r)
15624 {                               /* Assume that RE_INTUIT is set */
15625     dVAR;
15626     struct regexp *const prog = ReANY(r);
15627     GET_RE_DEBUG_FLAGS_DECL;
15628
15629     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
15630     PERL_UNUSED_CONTEXT;
15631
15632     DEBUG_COMPILE_r(
15633         {
15634             const char * const s = SvPV_nolen_const(prog->check_substr
15635                       ? prog->check_substr : prog->check_utf8);
15636
15637             if (!PL_colorset) reginitcolors();
15638             PerlIO_printf(Perl_debug_log,
15639                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
15640                       PL_colors[4],
15641                       prog->check_substr ? "" : "utf8 ",
15642                       PL_colors[5],PL_colors[0],
15643                       s,
15644                       PL_colors[1],
15645                       (strlen(s) > 60 ? "..." : ""));
15646         } );
15647
15648     return prog->check_substr ? prog->check_substr : prog->check_utf8;
15649 }
15650
15651 /*
15652    pregfree()
15653
15654    handles refcounting and freeing the perl core regexp structure. When
15655    it is necessary to actually free the structure the first thing it
15656    does is call the 'free' method of the regexp_engine associated to
15657    the regexp, allowing the handling of the void *pprivate; member
15658    first. (This routine is not overridable by extensions, which is why
15659    the extensions free is called first.)
15660
15661    See regdupe and regdupe_internal if you change anything here.
15662 */
15663 #ifndef PERL_IN_XSUB_RE
15664 void
15665 Perl_pregfree(pTHX_ REGEXP *r)
15666 {
15667     SvREFCNT_dec(r);
15668 }
15669
15670 void
15671 Perl_pregfree2(pTHX_ REGEXP *rx)
15672 {
15673     dVAR;
15674     struct regexp *const r = ReANY(rx);
15675     GET_RE_DEBUG_FLAGS_DECL;
15676
15677     PERL_ARGS_ASSERT_PREGFREE2;
15678
15679     if (r->mother_re) {
15680         ReREFCNT_dec(r->mother_re);
15681     } else {
15682         CALLREGFREE_PVT(rx); /* free the private data */
15683         SvREFCNT_dec(RXp_PAREN_NAMES(r));
15684         Safefree(r->xpv_len_u.xpvlenu_pv);
15685     }
15686     if (r->substrs) {
15687         SvREFCNT_dec(r->anchored_substr);
15688         SvREFCNT_dec(r->anchored_utf8);
15689         SvREFCNT_dec(r->float_substr);
15690         SvREFCNT_dec(r->float_utf8);
15691         Safefree(r->substrs);
15692     }
15693     RX_MATCH_COPY_FREE(rx);
15694 #ifdef PERL_ANY_COW
15695     SvREFCNT_dec(r->saved_copy);
15696 #endif
15697     Safefree(r->offs);
15698     SvREFCNT_dec(r->qr_anoncv);
15699     rx->sv_u.svu_rx = 0;
15700 }
15701
15702 /*  reg_temp_copy()
15703
15704     This is a hacky workaround to the structural issue of match results
15705     being stored in the regexp structure which is in turn stored in
15706     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
15707     could be PL_curpm in multiple contexts, and could require multiple
15708     result sets being associated with the pattern simultaneously, such
15709     as when doing a recursive match with (??{$qr})
15710
15711     The solution is to make a lightweight copy of the regexp structure
15712     when a qr// is returned from the code executed by (??{$qr}) this
15713     lightweight copy doesn't actually own any of its data except for
15714     the starp/end and the actual regexp structure itself.
15715
15716 */
15717
15718
15719 REGEXP *
15720 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
15721 {
15722     struct regexp *ret;
15723     struct regexp *const r = ReANY(rx);
15724     const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
15725
15726     PERL_ARGS_ASSERT_REG_TEMP_COPY;
15727
15728     if (!ret_x)
15729         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
15730     else {
15731         SvOK_off((SV *)ret_x);
15732         if (islv) {
15733             /* For PVLVs, SvANY points to the xpvlv body while sv_u points
15734                to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
15735                made both spots point to the same regexp body.) */
15736             REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
15737             assert(!SvPVX(ret_x));
15738             ret_x->sv_u.svu_rx = temp->sv_any;
15739             temp->sv_any = NULL;
15740             SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
15741             SvREFCNT_dec_NN(temp);
15742             /* SvCUR still resides in the xpvlv struct, so the regexp copy-
15743                ing below will not set it. */
15744             SvCUR_set(ret_x, SvCUR(rx));
15745         }
15746     }
15747     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
15748        sv_force_normal(sv) is called.  */
15749     SvFAKE_on(ret_x);
15750     ret = ReANY(ret_x);
15751
15752     SvFLAGS(ret_x) |= SvUTF8(rx);
15753     /* We share the same string buffer as the original regexp, on which we
15754        hold a reference count, incremented when mother_re is set below.
15755        The string pointer is copied here, being part of the regexp struct.
15756      */
15757     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
15758            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
15759     if (r->offs) {
15760         const I32 npar = r->nparens+1;
15761         Newx(ret->offs, npar, regexp_paren_pair);
15762         Copy(r->offs, ret->offs, npar, regexp_paren_pair);
15763     }
15764     if (r->substrs) {
15765         Newx(ret->substrs, 1, struct reg_substr_data);
15766         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15767
15768         SvREFCNT_inc_void(ret->anchored_substr);
15769         SvREFCNT_inc_void(ret->anchored_utf8);
15770         SvREFCNT_inc_void(ret->float_substr);
15771         SvREFCNT_inc_void(ret->float_utf8);
15772
15773         /* check_substr and check_utf8, if non-NULL, point to either their
15774            anchored or float namesakes, and don't hold a second reference.  */
15775     }
15776     RX_MATCH_COPIED_off(ret_x);
15777 #ifdef PERL_ANY_COW
15778     ret->saved_copy = NULL;
15779 #endif
15780     ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
15781     SvREFCNT_inc_void(ret->qr_anoncv);
15782
15783     return ret_x;
15784 }
15785 #endif
15786
15787 /* regfree_internal()
15788
15789    Free the private data in a regexp. This is overloadable by
15790    extensions. Perl takes care of the regexp structure in pregfree(),
15791    this covers the *pprivate pointer which technically perl doesn't
15792    know about, however of course we have to handle the
15793    regexp_internal structure when no extension is in use.
15794
15795    Note this is called before freeing anything in the regexp
15796    structure.
15797  */
15798
15799 void
15800 Perl_regfree_internal(pTHX_ REGEXP * const rx)
15801 {
15802     dVAR;
15803     struct regexp *const r = ReANY(rx);
15804     RXi_GET_DECL(r,ri);
15805     GET_RE_DEBUG_FLAGS_DECL;
15806
15807     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
15808
15809     DEBUG_COMPILE_r({
15810         if (!PL_colorset)
15811             reginitcolors();
15812         {
15813             SV *dsv= sv_newmortal();
15814             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
15815                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
15816             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
15817                 PL_colors[4],PL_colors[5],s);
15818         }
15819     });
15820 #ifdef RE_TRACK_PATTERN_OFFSETS
15821     if (ri->u.offsets)
15822         Safefree(ri->u.offsets);             /* 20010421 MJD */
15823 #endif
15824     if (ri->code_blocks) {
15825         int n;
15826         for (n = 0; n < ri->num_code_blocks; n++)
15827             SvREFCNT_dec(ri->code_blocks[n].src_regex);
15828         Safefree(ri->code_blocks);
15829     }
15830
15831     if (ri->data) {
15832         int n = ri->data->count;
15833
15834         while (--n >= 0) {
15835           /* If you add a ->what type here, update the comment in regcomp.h */
15836             switch (ri->data->what[n]) {
15837             case 'a':
15838             case 'r':
15839             case 's':
15840             case 'S':
15841             case 'u':
15842                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
15843                 break;
15844             case 'f':
15845                 Safefree(ri->data->data[n]);
15846                 break;
15847             case 'l':
15848             case 'L':
15849                 break;
15850             case 'T':
15851                 { /* Aho Corasick add-on structure for a trie node.
15852                      Used in stclass optimization only */
15853                     U32 refcount;
15854                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
15855                     OP_REFCNT_LOCK;
15856                     refcount = --aho->refcount;
15857                     OP_REFCNT_UNLOCK;
15858                     if ( !refcount ) {
15859                         PerlMemShared_free(aho->states);
15860                         PerlMemShared_free(aho->fail);
15861                          /* do this last!!!! */
15862                         PerlMemShared_free(ri->data->data[n]);
15863                         PerlMemShared_free(ri->regstclass);
15864                     }
15865                 }
15866                 break;
15867             case 't':
15868                 {
15869                     /* trie structure. */
15870                     U32 refcount;
15871                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
15872                     OP_REFCNT_LOCK;
15873                     refcount = --trie->refcount;
15874                     OP_REFCNT_UNLOCK;
15875                     if ( !refcount ) {
15876                         PerlMemShared_free(trie->charmap);
15877                         PerlMemShared_free(trie->states);
15878                         PerlMemShared_free(trie->trans);
15879                         if (trie->bitmap)
15880                             PerlMemShared_free(trie->bitmap);
15881                         if (trie->jump)
15882                             PerlMemShared_free(trie->jump);
15883                         PerlMemShared_free(trie->wordinfo);
15884                         /* do this last!!!! */
15885                         PerlMemShared_free(ri->data->data[n]);
15886                     }
15887                 }
15888                 break;
15889             default:
15890                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
15891             }
15892         }
15893         Safefree(ri->data->what);
15894         Safefree(ri->data);
15895     }
15896
15897     Safefree(ri);
15898 }
15899
15900 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
15901 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
15902 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
15903
15904 /*
15905    re_dup - duplicate a regexp.
15906
15907    This routine is expected to clone a given regexp structure. It is only
15908    compiled under USE_ITHREADS.
15909
15910    After all of the core data stored in struct regexp is duplicated
15911    the regexp_engine.dupe method is used to copy any private data
15912    stored in the *pprivate pointer. This allows extensions to handle
15913    any duplication it needs to do.
15914
15915    See pregfree() and regfree_internal() if you change anything here.
15916 */
15917 #if defined(USE_ITHREADS)
15918 #ifndef PERL_IN_XSUB_RE
15919 void
15920 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
15921 {
15922     dVAR;
15923     I32 npar;
15924     const struct regexp *r = ReANY(sstr);
15925     struct regexp *ret = ReANY(dstr);
15926
15927     PERL_ARGS_ASSERT_RE_DUP_GUTS;
15928
15929     npar = r->nparens+1;
15930     Newx(ret->offs, npar, regexp_paren_pair);
15931     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
15932
15933     if (ret->substrs) {
15934         /* Do it this way to avoid reading from *r after the StructCopy().
15935            That way, if any of the sv_dup_inc()s dislodge *r from the L1
15936            cache, it doesn't matter.  */
15937         const bool anchored = r->check_substr
15938             ? r->check_substr == r->anchored_substr
15939             : r->check_utf8 == r->anchored_utf8;
15940         Newx(ret->substrs, 1, struct reg_substr_data);
15941         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15942
15943         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
15944         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
15945         ret->float_substr = sv_dup_inc(ret->float_substr, param);
15946         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
15947
15948         /* check_substr and check_utf8, if non-NULL, point to either their
15949            anchored or float namesakes, and don't hold a second reference.  */
15950
15951         if (ret->check_substr) {
15952             if (anchored) {
15953                 assert(r->check_utf8 == r->anchored_utf8);
15954                 ret->check_substr = ret->anchored_substr;
15955                 ret->check_utf8 = ret->anchored_utf8;
15956             } else {
15957                 assert(r->check_substr == r->float_substr);
15958                 assert(r->check_utf8 == r->float_utf8);
15959                 ret->check_substr = ret->float_substr;
15960                 ret->check_utf8 = ret->float_utf8;
15961             }
15962         } else if (ret->check_utf8) {
15963             if (anchored) {
15964                 ret->check_utf8 = ret->anchored_utf8;
15965             } else {
15966                 ret->check_utf8 = ret->float_utf8;
15967             }
15968         }
15969     }
15970
15971     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
15972     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
15973
15974     if (ret->pprivate)
15975         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
15976
15977     if (RX_MATCH_COPIED(dstr))
15978         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
15979     else
15980         ret->subbeg = NULL;
15981 #ifdef PERL_ANY_COW
15982     ret->saved_copy = NULL;
15983 #endif
15984
15985     /* Whether mother_re be set or no, we need to copy the string.  We
15986        cannot refrain from copying it when the storage points directly to
15987        our mother regexp, because that's
15988                1: a buffer in a different thread
15989                2: something we no longer hold a reference on
15990                so we need to copy it locally.  */
15991     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
15992     ret->mother_re   = NULL;
15993 }
15994 #endif /* PERL_IN_XSUB_RE */
15995
15996 /*
15997    regdupe_internal()
15998
15999    This is the internal complement to regdupe() which is used to copy
16000    the structure pointed to by the *pprivate pointer in the regexp.
16001    This is the core version of the extension overridable cloning hook.
16002    The regexp structure being duplicated will be copied by perl prior
16003    to this and will be provided as the regexp *r argument, however
16004    with the /old/ structures pprivate pointer value. Thus this routine
16005    may override any copying normally done by perl.
16006
16007    It returns a pointer to the new regexp_internal structure.
16008 */
16009
16010 void *
16011 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
16012 {
16013     dVAR;
16014     struct regexp *const r = ReANY(rx);
16015     regexp_internal *reti;
16016     int len;
16017     RXi_GET_DECL(r,ri);
16018
16019     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
16020
16021     len = ProgLen(ri);
16022
16023     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
16024     Copy(ri->program, reti->program, len+1, regnode);
16025
16026     reti->num_code_blocks = ri->num_code_blocks;
16027     if (ri->code_blocks) {
16028         int n;
16029         Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
16030                 struct reg_code_block);
16031         Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
16032                 struct reg_code_block);
16033         for (n = 0; n < ri->num_code_blocks; n++)
16034              reti->code_blocks[n].src_regex = (REGEXP*)
16035                     sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
16036     }
16037     else
16038         reti->code_blocks = NULL;
16039
16040     reti->regstclass = NULL;
16041
16042     if (ri->data) {
16043         struct reg_data *d;
16044         const int count = ri->data->count;
16045         int i;
16046
16047         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
16048                 char, struct reg_data);
16049         Newx(d->what, count, U8);
16050
16051         d->count = count;
16052         for (i = 0; i < count; i++) {
16053             d->what[i] = ri->data->what[i];
16054             switch (d->what[i]) {
16055                 /* see also regcomp.h and regfree_internal() */
16056             case 'a': /* actually an AV, but the dup function is identical.  */
16057             case 'r':
16058             case 's':
16059             case 'S':
16060             case 'u': /* actually an HV, but the dup function is identical.  */
16061                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
16062                 break;
16063             case 'f':
16064                 /* This is cheating. */
16065                 Newx(d->data[i], 1, regnode_ssc);
16066                 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
16067                 reti->regstclass = (regnode*)d->data[i];
16068                 break;
16069             case 'T':
16070                 /* Trie stclasses are readonly and can thus be shared
16071                  * without duplication. We free the stclass in pregfree
16072                  * when the corresponding reg_ac_data struct is freed.
16073                  */
16074                 reti->regstclass= ri->regstclass;
16075                 /* Fall through */
16076             case 't':
16077                 OP_REFCNT_LOCK;
16078                 ((reg_trie_data*)ri->data->data[i])->refcount++;
16079                 OP_REFCNT_UNLOCK;
16080                 /* Fall through */
16081             case 'l':
16082             case 'L':
16083                 d->data[i] = ri->data->data[i];
16084                 break;
16085             default:
16086                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
16087             }
16088         }
16089
16090         reti->data = d;
16091     }
16092     else
16093         reti->data = NULL;
16094
16095     reti->name_list_idx = ri->name_list_idx;
16096
16097 #ifdef RE_TRACK_PATTERN_OFFSETS
16098     if (ri->u.offsets) {
16099         Newx(reti->u.offsets, 2*len+1, U32);
16100         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
16101     }
16102 #else
16103     SetProgLen(reti,len);
16104 #endif
16105
16106     return (void*)reti;
16107 }
16108
16109 #endif    /* USE_ITHREADS */
16110
16111 #ifndef PERL_IN_XSUB_RE
16112
16113 /*
16114  - regnext - dig the "next" pointer out of a node
16115  */
16116 regnode *
16117 Perl_regnext(pTHX_ regnode *p)
16118 {
16119     dVAR;
16120     I32 offset;
16121
16122     if (!p)
16123         return(NULL);
16124
16125     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
16126         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
16127     }
16128
16129     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
16130     if (offset == 0)
16131         return(NULL);
16132
16133     return(p+offset);
16134 }
16135 #endif
16136
16137 STATIC void
16138 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
16139 {
16140     va_list args;
16141     STRLEN l1 = strlen(pat1);
16142     STRLEN l2 = strlen(pat2);
16143     char buf[512];
16144     SV *msv;
16145     const char *message;
16146
16147     PERL_ARGS_ASSERT_RE_CROAK2;
16148
16149     if (l1 > 510)
16150         l1 = 510;
16151     if (l1 + l2 > 510)
16152         l2 = 510 - l1;
16153     Copy(pat1, buf, l1 , char);
16154     Copy(pat2, buf + l1, l2 , char);
16155     buf[l1 + l2] = '\n';
16156     buf[l1 + l2 + 1] = '\0';
16157     va_start(args, pat2);
16158     msv = vmess(buf, &args);
16159     va_end(args);
16160     message = SvPV_const(msv,l1);
16161     if (l1 > 512)
16162         l1 = 512;
16163     Copy(message, buf, l1 , char);
16164     /* l1-1 to avoid \n */
16165     Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
16166 }
16167
16168 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
16169
16170 #ifndef PERL_IN_XSUB_RE
16171 void
16172 Perl_save_re_context(pTHX)
16173 {
16174     dVAR;
16175
16176     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
16177     if (PL_curpm) {
16178         const REGEXP * const rx = PM_GETRE(PL_curpm);
16179         if (rx) {
16180             U32 i;
16181             for (i = 1; i <= RX_NPARENS(rx); i++) {
16182                 char digits[TYPE_CHARS(long)];
16183                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
16184                 GV *const *const gvp
16185                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
16186
16187                 if (gvp) {
16188                     GV * const gv = *gvp;
16189                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
16190                         save_scalar(gv);
16191                 }
16192             }
16193         }
16194     }
16195 }
16196 #endif
16197
16198 #ifdef DEBUGGING
16199
16200 STATIC void
16201 S_put_byte(pTHX_ SV *sv, int c)
16202 {
16203     PERL_ARGS_ASSERT_PUT_BYTE;
16204
16205     if (!isPRINT(c)) {
16206         switch (c) {
16207             case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
16208             case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
16209             case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
16210             case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
16211             case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
16212
16213             default:
16214                 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
16215                 break;
16216         }
16217     }
16218     else {
16219         const char string = c;
16220         if (c == '-' || c == ']' || c == '\\' || c == '^')
16221             sv_catpvs(sv, "\\");
16222         sv_catpvn(sv, &string, 1);
16223     }
16224 }
16225
16226 STATIC bool
16227 S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
16228 {
16229     /* Appends to 'sv' a displayable version of the innards of the bracketed
16230      * character class whose bitmap is 'bitmap';  Returns 'TRUE' if it actually
16231      * output anything */
16232
16233     int i;
16234     int rangestart = -1;
16235     bool has_output_anything = FALSE;
16236
16237     PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
16238
16239     for (i = 0; i <= 256; i++) {
16240         if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) {
16241             if (rangestart == -1)
16242                 rangestart = i;
16243         } else if (rangestart != -1) {
16244             int j = i - 1;
16245             if (i <= rangestart + 3) {  /* Individual chars in short ranges */
16246                 for (; rangestart < i; rangestart++)
16247                     put_byte(sv, rangestart);
16248             }
16249             else if (   j > 255
16250                      || ! isALPHANUMERIC(rangestart)
16251                      || ! isALPHANUMERIC(j)
16252                      || isDIGIT(rangestart) != isDIGIT(j)
16253                      || isUPPER(rangestart) != isUPPER(j)
16254                      || isLOWER(rangestart) != isLOWER(j)
16255
16256                         /* This final test should get optimized out except
16257                          * on EBCDIC platforms, where it causes ranges that
16258                          * cross discontinuities like i/j to be shown as hex
16259                          * instead of the misleading, e.g. H-K (since that
16260                          * range includes more than H, I, J, K). */
16261                      || (j - rangestart)
16262                          != NATIVE_TO_ASCII(j) - NATIVE_TO_ASCII(rangestart))
16263             {
16264                 Perl_sv_catpvf(aTHX_ sv, "\\x{%02x}-\\x{%02x}",
16265                                rangestart,
16266                                (j < 256) ? j : 255);
16267             }
16268             else { /* Here, the ends of the range are both digits, or both
16269                       uppercase, or both lowercase; and there's no
16270                       discontinuity in the range (which could happen on EBCDIC
16271                       platforms) */
16272                 put_byte(sv, rangestart);
16273                 sv_catpvs(sv, "-");
16274                 put_byte(sv, j);
16275             }
16276             rangestart = -1;
16277             has_output_anything = TRUE;
16278         }
16279     }
16280
16281     return has_output_anything;
16282 }
16283
16284 #define CLEAR_OPTSTART \
16285     if (optstart) STMT_START { \
16286             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
16287             optstart=NULL; \
16288     } STMT_END
16289
16290 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
16291
16292 STATIC const regnode *
16293 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
16294             const regnode *last, const regnode *plast,
16295             SV* sv, I32 indent, U32 depth)
16296 {
16297     dVAR;
16298     U8 op = PSEUDO;     /* Arbitrary non-END op. */
16299     const regnode *next;
16300     const regnode *optstart= NULL;
16301
16302     RXi_GET_DECL(r,ri);
16303     GET_RE_DEBUG_FLAGS_DECL;
16304
16305     PERL_ARGS_ASSERT_DUMPUNTIL;
16306
16307 #ifdef DEBUG_DUMPUNTIL
16308     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
16309         last ? last-start : 0,plast ? plast-start : 0);
16310 #endif
16311
16312     if (plast && plast < last)
16313         last= plast;
16314
16315     while (PL_regkind[op] != END && (!last || node < last)) {
16316         /* While that wasn't END last time... */
16317         NODE_ALIGN(node);
16318         op = OP(node);
16319         if (op == CLOSE || op == WHILEM)
16320             indent--;
16321         next = regnext((regnode *)node);
16322
16323         /* Where, what. */
16324         if (OP(node) == OPTIMIZED) {
16325             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
16326                 optstart = node;
16327             else
16328                 goto after_print;
16329         } else
16330             CLEAR_OPTSTART;
16331
16332         regprop(r, sv, node);
16333         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
16334                       (int)(2*indent + 1), "", SvPVX_const(sv));
16335
16336         if (OP(node) != OPTIMIZED) {
16337             if (next == NULL)           /* Next ptr. */
16338                 PerlIO_printf(Perl_debug_log, " (0)");
16339             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
16340                 PerlIO_printf(Perl_debug_log, " (FAIL)");
16341             else
16342                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
16343             (void)PerlIO_putc(Perl_debug_log, '\n');
16344         }
16345
16346       after_print:
16347         if (PL_regkind[(U8)op] == BRANCHJ) {
16348             assert(next);
16349             {
16350                 const regnode *nnode = (OP(next) == LONGJMP
16351                                        ? regnext((regnode *)next)
16352                                        : next);
16353                 if (last && nnode > last)
16354                     nnode = last;
16355                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
16356             }
16357         }
16358         else if (PL_regkind[(U8)op] == BRANCH) {
16359             assert(next);
16360             DUMPUNTIL(NEXTOPER(node), next);
16361         }
16362         else if ( PL_regkind[(U8)op]  == TRIE ) {
16363             const regnode *this_trie = node;
16364             const char op = OP(node);
16365             const U32 n = ARG(node);
16366             const reg_ac_data * const ac = op>=AHOCORASICK ?
16367                (reg_ac_data *)ri->data->data[n] :
16368                NULL;
16369             const reg_trie_data * const trie =
16370                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
16371 #ifdef DEBUGGING
16372             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
16373 #endif
16374             const regnode *nextbranch= NULL;
16375             I32 word_idx;
16376             sv_setpvs(sv, "");
16377             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
16378                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
16379
16380                 PerlIO_printf(Perl_debug_log, "%*s%s ",
16381                    (int)(2*(indent+3)), "",
16382                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
16383                             PL_colors[0], PL_colors[1],
16384                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
16385                             PERL_PV_PRETTY_ELLIPSES    |
16386                             PERL_PV_PRETTY_LTGT
16387                             )
16388                             : "???"
16389                 );
16390                 if (trie->jump) {
16391                     U16 dist= trie->jump[word_idx+1];
16392                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
16393                                   (UV)((dist ? this_trie + dist : next) - start));
16394                     if (dist) {
16395                         if (!nextbranch)
16396                             nextbranch= this_trie + trie->jump[0];
16397                         DUMPUNTIL(this_trie + dist, nextbranch);
16398                     }
16399                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
16400                         nextbranch= regnext((regnode *)nextbranch);
16401                 } else {
16402                     PerlIO_printf(Perl_debug_log, "\n");
16403                 }
16404             }
16405             if (last && next > last)
16406                 node= last;
16407             else
16408                 node= next;
16409         }
16410         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
16411             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
16412                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
16413         }
16414         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
16415             assert(next);
16416             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
16417         }
16418         else if ( op == PLUS || op == STAR) {
16419             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
16420         }
16421         else if (PL_regkind[(U8)op] == ANYOF) {
16422             /* arglen 1 + class block */
16423             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL)
16424                     ? ANYOF_POSIXL_SKIP : ANYOF_SKIP);
16425             node = NEXTOPER(node);
16426         }
16427         else if (PL_regkind[(U8)op] == EXACT) {
16428             /* Literal string, where present. */
16429             node += NODE_SZ_STR(node) - 1;
16430             node = NEXTOPER(node);
16431         }
16432         else {
16433             node = NEXTOPER(node);
16434             node += regarglen[(U8)op];
16435         }
16436         if (op == CURLYX || op == OPEN)
16437             indent++;
16438     }
16439     CLEAR_OPTSTART;
16440 #ifdef DEBUG_DUMPUNTIL
16441     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
16442 #endif
16443     return node;
16444 }
16445
16446 #endif  /* DEBUGGING */
16447
16448 /*
16449  * Local variables:
16450  * c-indentation-style: bsd
16451  * c-basic-offset: 4
16452  * indent-tabs-mode: nil
16453  * End:
16454  *
16455  * ex: set ts=8 sts=4 sw=4 et:
16456  */