regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 extern const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_static.c"
  90 #include "charclass_invlists.h"
  91 #include "inline_invlist.c"
  92 #include "unicode_constants.h"
  93
  94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  95 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  96 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  97
  98 #ifndef STATIC
  99 #define STATIC  static
 100 #endif
 101
 102
 103 struct RExC_state_t {
 104     U32         flags;                  /* RXf_* are we folding, multilining? */
 105     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 106     char        *precomp;               /* uncompiled string. */
 107     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 108     regexp      *rx;                    /* perl core regexp structure */
 109     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 110     char        *start;                 /* Start of input for compile */
 111     char        *end;                   /* End of input for compile */
 112     char        *parse;                 /* Input-scan pointer. */
 113     SSize_t     whilem_seen;            /* number of WHILEM in this expr */
 114     regnode     *emit_start;            /* Start of emitted-code area */
 115     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 116     regnode     *emit;                  /* Code-emit pointer; if = &emit_dummy,
 117                                            implies compiling, so don't emit */
 118     regnode_ssc emit_dummy;             /* placeholder for emit to point to;
 119                                            large enough for the largest
 120                                            non-EXACTish node, so can use it as
 121                                            scratch in pass1 */
 122     I32         naughty;                /* How bad is this pattern? */
 123     I32         sawback;                /* Did we see \1, ...? */
 124     U32         seen;
 125     SSize_t     size;                   /* Code size. */
 126     I32         npar;                   /* Capture buffer count, (OPEN). */
 127     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 128     I32         nestroot;               /* root parens we are in - used by accept */
 129     I32         extralen;
 130     I32         seen_zerolen;
 131     regnode     **open_parens;          /* pointers to open parens */
 132     regnode     **close_parens;         /* pointers to close parens */
 133     regnode     *opend;                 /* END node in program */
 134     I32         utf8;           /* whether the pattern is utf8 or not */
 135     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 136                                 /* XXX use this for future optimisation of case
 137                                  * where pattern must be upgraded to utf8. */
 138     I32         uni_semantics;  /* If a d charset modifier should use unicode
 139                                    rules, even if the pattern is not in
 140                                    utf8 */
 141     HV          *paren_names;           /* Paren names */
 142
 143     regnode     **recurse;              /* Recurse regops */
 144     I32         recurse_count;          /* Number of recurse regops */
 145     I32         in_lookbehind;
 146     I32         contains_locale;
 147     I32         contains_i;
 148     I32         override_recoding;
 149     I32         in_multi_char_class;
 150     struct reg_code_block *code_blocks; /* positions of literal (?{})
 151                                             within pattern */
 152     int         num_code_blocks;        /* size of code_blocks[] */
 153     int         code_index;             /* next code_blocks[] slot */
 154 #if ADD_TO_REGEXEC
 155     char        *starttry;              /* -Dr: where regtry was called. */
 156 #define RExC_starttry   (pRExC_state->starttry)
 157 #endif
 158     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 159 #ifdef DEBUGGING
 160     const char  *lastparse;
 161     I32         lastnum;
 162     AV          *paren_name_list;       /* idx -> name */
 163 #define RExC_lastparse  (pRExC_state->lastparse)
 164 #define RExC_lastnum    (pRExC_state->lastnum)
 165 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 166 #endif
 167 };
 168
 169 #define RExC_flags      (pRExC_state->flags)
 170 #define RExC_pm_flags   (pRExC_state->pm_flags)
 171 #define RExC_precomp    (pRExC_state->precomp)
 172 #define RExC_rx_sv      (pRExC_state->rx_sv)
 173 #define RExC_rx         (pRExC_state->rx)
 174 #define RExC_rxi        (pRExC_state->rxi)
 175 #define RExC_start      (pRExC_state->start)
 176 #define RExC_end        (pRExC_state->end)
 177 #define RExC_parse      (pRExC_state->parse)
 178 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 179 #ifdef RE_TRACK_PATTERN_OFFSETS
 180 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 181 #endif
 182 #define RExC_emit       (pRExC_state->emit)
 183 #define RExC_emit_dummy (pRExC_state->emit_dummy)
 184 #define RExC_emit_start (pRExC_state->emit_start)
 185 #define RExC_emit_bound (pRExC_state->emit_bound)
 186 #define RExC_naughty    (pRExC_state->naughty)
 187 #define RExC_sawback    (pRExC_state->sawback)
 188 #define RExC_seen       (pRExC_state->seen)
 189 #define RExC_size       (pRExC_state->size)
 190 #define RExC_npar       (pRExC_state->npar)
 191 #define RExC_nestroot   (pRExC_state->nestroot)
 192 #define RExC_extralen   (pRExC_state->extralen)
 193 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 194 #define RExC_utf8       (pRExC_state->utf8)
 195 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 196 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 197 #define RExC_open_parens        (pRExC_state->open_parens)
 198 #define RExC_close_parens       (pRExC_state->close_parens)
 199 #define RExC_opend      (pRExC_state->opend)
 200 #define RExC_paren_names        (pRExC_state->paren_names)
 201 #define RExC_recurse    (pRExC_state->recurse)
 202 #define RExC_recurse_count      (pRExC_state->recurse_count)
 203 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 204 #define RExC_contains_locale    (pRExC_state->contains_locale)
 205 #define RExC_contains_i (pRExC_state->contains_i)
 206 #define RExC_override_recoding (pRExC_state->override_recoding)
 207 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 208
 209
 210 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 211 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 212         ((*s) == '{' && regcurly(s, FALSE)))
 213
 214 /*
 215  * Flags to be passed up and down.
 216  */
 217 #define WORST           0       /* Worst case. */
 218 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 219
 220 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 221  * character.  (There needs to be a case: in the switch statement in regexec.c
 222  * for any node marked SIMPLE.)  Note that this is not the same thing as
 223  * REGNODE_SIMPLE */
 224 #define SIMPLE          0x02
 225 #define SPSTART         0x04    /* Starts with * or + */
 226 #define POSTPONED       0x08    /* (?1),(?&name), (??{...}) or similar */
 227 #define TRYAGAIN        0x10    /* Weeded out a declaration. */
 228 #define RESTART_UTF8    0x20    /* Restart, need to calcuate sizes as UTF-8 */
 229
 230 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 231
 232 /* whether trie related optimizations are enabled */
 233 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 234 #define TRIE_STUDY_OPT
 235 #define FULL_TRIE_STUDY
 236 #define TRIE_STCLASS
 237 #endif
 238
 239
 240
 241 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 242 #define PBITVAL(paren) (1 << ((paren) & 7))
 243 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 244 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 245 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 246
 247 #define REQUIRE_UTF8    STMT_START {                                       \
 248                                      if (!UTF) {                           \
 249                                          *flagp = RESTART_UTF8;            \
 250                                          return NULL;                      \
 251                                      }                                     \
 252                         } STMT_END
 253
 254 /* This converts the named class defined in regcomp.h to its equivalent class
 255  * number defined in handy.h. */
 256 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 257 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 258
 259 #define _invlist_union_complement_2nd(a, b, output) \
 260                         _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
 261 #define _invlist_intersection_complement_2nd(a, b, output) \
 262                  _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
 263
 264 /* About scan_data_t.
 265
 266   During optimisation we recurse through the regexp program performing
 267   various inplace (keyhole style) optimisations. In addition study_chunk
 268   and scan_commit populate this data structure with information about
 269   what strings MUST appear in the pattern. We look for the longest
 270   string that must appear at a fixed location, and we look for the
 271   longest string that may appear at a floating location. So for instance
 272   in the pattern:
 273
 274     /FOO[xX]A.*B[xX]BAR/
 275
 276   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 277   strings (because they follow a .* construct). study_chunk will identify
 278   both FOO and BAR as being the longest fixed and floating strings respectively.
 279
 280   The strings can be composites, for instance
 281
 282      /(f)(o)(o)/
 283
 284   will result in a composite fixed substring 'foo'.
 285
 286   For each string some basic information is maintained:
 287
 288   - offset or min_offset
 289     This is the position the string must appear at, or not before.
 290     It also implicitly (when combined with minlenp) tells us how many
 291     characters must match before the string we are searching for.
 292     Likewise when combined with minlenp and the length of the string it
 293     tells us how many characters must appear after the string we have
 294     found.
 295
 296   - max_offset
 297     Only used for floating strings. This is the rightmost point that
 298     the string can appear at. If set to SSize_t_MAX it indicates that the
 299     string can occur infinitely far to the right.
 300
 301   - minlenp
 302     A pointer to the minimum number of characters of the pattern that the
 303     string was found inside. This is important as in the case of positive
 304     lookahead or positive lookbehind we can have multiple patterns
 305     involved. Consider
 306
 307     /(?=FOO).*F/
 308
 309     The minimum length of the pattern overall is 3, the minimum length
 310     of the lookahead part is 3, but the minimum length of the part that
 311     will actually match is 1. So 'FOO's minimum length is 3, but the
 312     minimum length for the F is 1. This is important as the minimum length
 313     is used to determine offsets in front of and behind the string being
 314     looked for.  Since strings can be composites this is the length of the
 315     pattern at the time it was committed with a scan_commit. Note that
 316     the length is calculated by study_chunk, so that the minimum lengths
 317     are not known until the full pattern has been compiled, thus the
 318     pointer to the value.
 319
 320   - lookbehind
 321
 322     In the case of lookbehind the string being searched for can be
 323     offset past the start point of the final matching string.
 324     If this value was just blithely removed from the min_offset it would
 325     invalidate some of the calculations for how many chars must match
 326     before or after (as they are derived from min_offset and minlen and
 327     the length of the string being searched for).
 328     When the final pattern is compiled and the data is moved from the
 329     scan_data_t structure into the regexp structure the information
 330     about lookbehind is factored in, with the information that would
 331     have been lost precalculated in the end_shift field for the
 332     associated string.
 333
 334   The fields pos_min and pos_delta are used to store the minimum offset
 335   and the delta to the maximum offset at the current point in the pattern.
 336
 337 */
 338
 339 typedef struct scan_data_t {
 340     /*I32 len_min;      unused */
 341     /*I32 len_delta;    unused */
 342     SSize_t pos_min;
 343     SSize_t pos_delta;
 344     SV *last_found;
 345     SSize_t last_end;       /* min value, <0 unless valid. */
 346     SSize_t last_start_min;
 347     SSize_t last_start_max;
 348     SV **longest;           /* Either &l_fixed, or &l_float. */
 349     SV *longest_fixed;      /* longest fixed string found in pattern */
 350     SSize_t offset_fixed;   /* offset where it starts */
 351     SSize_t *minlen_fixed;  /* pointer to the minlen relevant to the string */
 352     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 353     SV *longest_float;      /* longest floating string found in pattern */
 354     SSize_t offset_float_min; /* earliest point in string it can appear */
 355     SSize_t offset_float_max; /* latest point in string it can appear */
 356     SSize_t *minlen_float;  /* pointer to the minlen relevant to the string */
 357     SSize_t lookbehind_float; /* is the pos of the string modified by LB */
 358     I32 flags;
 359     I32 whilem_c;
 360     SSize_t *last_closep;
 361     regnode_ssc *start_class;
 362 } scan_data_t;
 363
 364 /* The below is perhaps overboard, but this allows us to save a test at the
 365  * expense of a mask.  This is because on both EBCDIC and ASCII machines, 'A'
 366  * and 'a' differ by a single bit; the same with the upper and lower case of
 367  * all other ASCII-range alphabetics.  On ASCII platforms, they are 32 apart;
 368  * on EBCDIC, they are 64.  This uses an exclusive 'or' to find that bit and
 369  * then inverts it to form a mask, with just a single 0, in the bit position
 370  * where the upper- and lowercase differ.  XXX There are about 40 other
 371  * instances in the Perl core where this micro-optimization could be used.
 372  * Should decide if maintenance cost is worse, before changing those
 373  *
 374  * Returns a boolean as to whether or not 'v' is either a lowercase or
 375  * uppercase instance of 'c', where 'c' is in [A-Za-z].  If 'c' is a
 376  * compile-time constant, the generated code is better than some optimizing
 377  * compilers figure out, amounting to a mask and test.  The results are
 378  * meaningless if 'c' is not one of [A-Za-z] */
 379 #define isARG2_lower_or_UPPER_ARG1(c, v) \
 380                               (((v) & ~('A' ^ 'a')) ==  ((c) & ~('A' ^ 'a')))
 381
 382 /*
 383  * Forward declarations for pregcomp()'s friends.
 384  */
 385
 386 static const scan_data_t zero_scan_data =
 387   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 388
 389 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 390 #define SF_BEFORE_SEOL          0x0001
 391 #define SF_BEFORE_MEOL          0x0002
 392 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 393 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 394
 395 #define SF_FIX_SHIFT_EOL        (+2)
 396 #define SF_FL_SHIFT_EOL         (+4)
 397
 398 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 399 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 400
 401 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 402 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 403 #define SF_IS_INF               0x0040
 404 #define SF_HAS_PAR              0x0080
 405 #define SF_IN_PAR               0x0100
 406 #define SF_HAS_EVAL             0x0200
 407 #define SCF_DO_SUBSTR           0x0400
 408 #define SCF_DO_STCLASS_AND      0x0800
 409 #define SCF_DO_STCLASS_OR       0x1000
 410 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 411 #define SCF_WHILEM_VISITED_POS  0x2000
 412
 413 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 414 #define SCF_SEEN_ACCEPT         0x8000
 415 #define SCF_TRIE_DOING_RESTUDY 0x10000
 416
 417 #define UTF cBOOL(RExC_utf8)
 418
 419 /* The enums for all these are ordered so things work out correctly */
 420 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 421 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 422 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 423 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 424 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 425 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
 426 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 427
 428 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 429
 430 #define OOB_NAMEDCLASS          -1
 431
 432 /* There is no code point that is out-of-bounds, so this is problematic.  But
 433  * its only current use is to initialize a variable that is always set before
 434  * looked at. */
 435 #define OOB_UNICODE             0xDEADBEEF
 436
 437 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 438 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 439
 440
 441 /* length of regex to show in messages that don't mark a position within */
 442 #define RegexLengthToShowInErrorMessages 127
 443
 444 /*
 445  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 446  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 447  * op/pragma/warn/regcomp.
 448  */
 449 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 450 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 451
 452 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%"UTF8f MARKER2 "%"UTF8f"/"
 453
 454 #define REPORT_LOCATION_ARGS(offset)            \
 455                 UTF8fARG(UTF, offset, RExC_precomp), \
 456                 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
 457
 458 /*
 459  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 460  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 461  * "...".
 462  */
 463 #define _FAIL(code) STMT_START {                                        \
 464     const char *ellipses = "";                                          \
 465     IV len = RExC_end - RExC_precomp;                                   \
 466                                                                         \
 467     if (!SIZE_ONLY)                                                     \
 468         SAVEFREESV(RExC_rx_sv);                                         \
 469     if (len > RegexLengthToShowInErrorMessages) {                       \
 470         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 471         len = RegexLengthToShowInErrorMessages - 10;                    \
 472         ellipses = "...";                                               \
 473     }                                                                   \
 474     code;                                                               \
 475 } STMT_END
 476
 477 #define FAIL(msg) _FAIL(                            \
 478     Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/",           \
 479             msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 480
 481 #define FAIL2(msg,arg) _FAIL(                       \
 482     Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/",         \
 483             arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 484
 485 /*
 486  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 487  */
 488 #define Simple_vFAIL(m) STMT_START {                                    \
 489     const IV offset = RExC_parse - RExC_precomp;                        \
 490     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 491             m, REPORT_LOCATION_ARGS(offset));   \
 492 } STMT_END
 493
 494 /*
 495  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 496  */
 497 #define vFAIL(m) STMT_START {                           \
 498     if (!SIZE_ONLY)                                     \
 499         SAVEFREESV(RExC_rx_sv);                         \
 500     Simple_vFAIL(m);                                    \
 501 } STMT_END
 502
 503 /*
 504  * Like Simple_vFAIL(), but accepts two arguments.
 505  */
 506 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 507     const IV offset = RExC_parse - RExC_precomp;                        \
 508     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,                      \
 509                       REPORT_LOCATION_ARGS(offset));    \
 510 } STMT_END
 511
 512 /*
 513  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 514  */
 515 #define vFAIL2(m,a1) STMT_START {                       \
 516     if (!SIZE_ONLY)                                     \
 517         SAVEFREESV(RExC_rx_sv);                         \
 518     Simple_vFAIL2(m, a1);                               \
 519 } STMT_END
 520
 521
 522 /*
 523  * Like Simple_vFAIL(), but accepts three arguments.
 524  */
 525 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 526     const IV offset = RExC_parse - RExC_precomp;                \
 527     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,          \
 528             REPORT_LOCATION_ARGS(offset));      \
 529 } STMT_END
 530
 531 /*
 532  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 533  */
 534 #define vFAIL3(m,a1,a2) STMT_START {                    \
 535     if (!SIZE_ONLY)                                     \
 536         SAVEFREESV(RExC_rx_sv);                         \
 537     Simple_vFAIL3(m, a1, a2);                           \
 538 } STMT_END
 539
 540 /*
 541  * Like Simple_vFAIL(), but accepts four arguments.
 542  */
 543 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 544     const IV offset = RExC_parse - RExC_precomp;                \
 545     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3,              \
 546             REPORT_LOCATION_ARGS(offset));      \
 547 } STMT_END
 548
 549 #define vFAIL4(m,a1,a2,a3) STMT_START {                 \
 550     if (!SIZE_ONLY)                                     \
 551         SAVEFREESV(RExC_rx_sv);                         \
 552     Simple_vFAIL4(m, a1, a2, a3);                       \
 553 } STMT_END
 554
 555 /* A specialized version of vFAIL2 that works with UTF8f */
 556 #define vFAIL2utf8f(m, a1) STMT_START { \
 557     const IV offset = RExC_parse - RExC_precomp;   \
 558     if (!SIZE_ONLY)                                \
 559         SAVEFREESV(RExC_rx_sv);                    \
 560     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
 561             REPORT_LOCATION_ARGS(offset));         \
 562 } STMT_END
 563
 564
 565 /* m is not necessarily a "literal string", in this macro */
 566 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
 567     const IV offset = loc - RExC_precomp;                               \
 568     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
 569             m, REPORT_LOCATION_ARGS(offset));       \
 570 } STMT_END
 571
 572 #define ckWARNreg(loc,m) STMT_START {                                   \
 573     const IV offset = loc - RExC_precomp;                               \
 574     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 575             REPORT_LOCATION_ARGS(offset));              \
 576 } STMT_END
 577
 578 #define vWARN_dep(loc, m) STMT_START {                                  \
 579     const IV offset = loc - RExC_precomp;                               \
 580     Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION,     \
 581             REPORT_LOCATION_ARGS(offset));              \
 582 } STMT_END
 583
 584 #define ckWARNdep(loc,m) STMT_START {                                   \
 585     const IV offset = loc - RExC_precomp;                               \
 586     Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),                   \
 587             m REPORT_LOCATION,                                          \
 588             REPORT_LOCATION_ARGS(offset));              \
 589 } STMT_END
 590
 591 #define ckWARNregdep(loc,m) STMT_START {                                \
 592     const IV offset = loc - RExC_precomp;                               \
 593     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 594             m REPORT_LOCATION,                                          \
 595             REPORT_LOCATION_ARGS(offset));              \
 596 } STMT_END
 597
 598 #define ckWARN2reg_d(loc,m, a1) STMT_START {                            \
 599     const IV offset = loc - RExC_precomp;                               \
 600     Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),                       \
 601             m REPORT_LOCATION,                                          \
 602             a1, REPORT_LOCATION_ARGS(offset));  \
 603 } STMT_END
 604
 605 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 606     const IV offset = loc - RExC_precomp;                               \
 607     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 608             a1, REPORT_LOCATION_ARGS(offset));  \
 609 } STMT_END
 610
 611 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 612     const IV offset = loc - RExC_precomp;                               \
 613     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 614             a1, a2, REPORT_LOCATION_ARGS(offset));      \
 615 } STMT_END
 616
 617 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 618     const IV offset = loc - RExC_precomp;                               \
 619     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 620             a1, a2, REPORT_LOCATION_ARGS(offset));      \
 621 } STMT_END
 622
 623 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 624     const IV offset = loc - RExC_precomp;                               \
 625     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 626             a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 627 } STMT_END
 628
 629 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 630     const IV offset = loc - RExC_precomp;                               \
 631     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 632             a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 633 } STMT_END
 634
 635 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 636     const IV offset = loc - RExC_precomp;                               \
 637     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 638             a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
 639 } STMT_END
 640
 641
 642 /* Allow for side effects in s */
 643 #define REGC(c,s) STMT_START {                  \
 644     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 645 } STMT_END
 646
 647 /* Macros for recording node offsets.   20001227 mjd@plover.com
 648  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 649  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 650  * Element 0 holds the number n.
 651  * Position is 1 indexed.
 652  */
 653 #ifndef RE_TRACK_PATTERN_OFFSETS
 654 #define Set_Node_Offset_To_R(node,byte)
 655 #define Set_Node_Offset(node,byte)
 656 #define Set_Cur_Node_Offset
 657 #define Set_Node_Length_To_R(node,len)
 658 #define Set_Node_Length(node,len)
 659 #define Set_Node_Cur_Length(node,start)
 660 #define Node_Offset(n)
 661 #define Node_Length(n)
 662 #define Set_Node_Offset_Length(node,offset,len)
 663 #define ProgLen(ri) ri->u.proglen
 664 #define SetProgLen(ri,x) ri->u.proglen = x
 665 #else
 666 #define ProgLen(ri) ri->u.offsets[0]
 667 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 668 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 669     if (! SIZE_ONLY) {                                                  \
 670         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 671                     __LINE__, (int)(node), (int)(byte)));               \
 672         if((node) < 0) {                                                \
 673             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 674         } else {                                                        \
 675             RExC_offsets[2*(node)-1] = (byte);                          \
 676         }                                                               \
 677     }                                                                   \
 678 } STMT_END
 679
 680 #define Set_Node_Offset(node,byte) \
 681     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 682 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 683
 684 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 685     if (! SIZE_ONLY) {                                                  \
 686         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 687                 __LINE__, (int)(node), (int)(len)));                    \
 688         if((node) < 0) {                                                \
 689             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 690         } else {                                                        \
 691             RExC_offsets[2*(node)] = (len);                             \
 692         }                                                               \
 693     }                                                                   \
 694 } STMT_END
 695
 696 #define Set_Node_Length(node,len) \
 697     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 698 #define Set_Node_Cur_Length(node, start)                \
 699     Set_Node_Length(node, RExC_parse - start)
 700
 701 /* Get offsets and lengths */
 702 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 703 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 704
 705 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 706     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 707     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 708 } STMT_END
 709 #endif
 710
 711 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 712 #define EXPERIMENTAL_INPLACESCAN
 713 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 714
 715 #define DEBUG_STUDYDATA(str,data,depth)                              \
 716 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 717     PerlIO_printf(Perl_debug_log,                                    \
 718         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 719         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 720         (int)(depth)*2, "",                                          \
 721         (IV)((data)->pos_min),                                       \
 722         (IV)((data)->pos_delta),                                     \
 723         (UV)((data)->flags),                                         \
 724         (IV)((data)->whilem_c),                                      \
 725         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 726         is_inf ? "INF " : ""                                         \
 727     );                                                               \
 728     if ((data)->last_found)                                          \
 729         PerlIO_printf(Perl_debug_log,                                \
 730             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 731             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 732             SvPVX_const((data)->last_found),                         \
 733             (IV)((data)->last_end),                                  \
 734             (IV)((data)->last_start_min),                            \
 735             (IV)((data)->last_start_max),                            \
 736             ((data)->longest &&                                      \
 737              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 738             SvPVX_const((data)->longest_fixed),                      \
 739             (IV)((data)->offset_fixed),                              \
 740             ((data)->longest &&                                      \
 741              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 742             SvPVX_const((data)->longest_float),                      \
 743             (IV)((data)->offset_float_min),                          \
 744             (IV)((data)->offset_float_max)                           \
 745         );                                                           \
 746     PerlIO_printf(Perl_debug_log,"\n");                              \
 747 });
 748
 749 /* Mark that we cannot extend a found fixed substring at this point.
 750    Update the longest found anchored substring and the longest found
 751    floating substrings if needed. */
 752
 753 STATIC void
 754 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
 755                     SSize_t *minlenp, int is_inf)
 756 {
 757     const STRLEN l = CHR_SVLEN(data->last_found);
 758     const STRLEN old_l = CHR_SVLEN(*data->longest);
 759     GET_RE_DEBUG_FLAGS_DECL;
 760
 761     PERL_ARGS_ASSERT_SCAN_COMMIT;
 762
 763     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 764         SvSetMagicSV(*data->longest, data->last_found);
 765         if (*data->longest == data->longest_fixed) {
 766             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 767             if (data->flags & SF_BEFORE_EOL)
 768                 data->flags
 769                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 770             else
 771                 data->flags &= ~SF_FIX_BEFORE_EOL;
 772             data->minlen_fixed=minlenp;
 773             data->lookbehind_fixed=0;
 774         }
 775         else { /* *data->longest == data->longest_float */
 776             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 777             data->offset_float_max = (l
 778                                       ? data->last_start_max
 779                                       : (data->pos_delta == SSize_t_MAX
 780                                          ? SSize_t_MAX
 781                                          : data->pos_min + data->pos_delta));
 782             if (is_inf
 783                  || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
 784                 data->offset_float_max = SSize_t_MAX;
 785             if (data->flags & SF_BEFORE_EOL)
 786                 data->flags
 787                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 788             else
 789                 data->flags &= ~SF_FL_BEFORE_EOL;
 790             data->minlen_float=minlenp;
 791             data->lookbehind_float=0;
 792         }
 793     }
 794     SvCUR_set(data->last_found, 0);
 795     {
 796         SV * const sv = data->last_found;
 797         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 798             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 799             if (mg)
 800                 mg->mg_len = 0;
 801         }
 802     }
 803     data->last_end = -1;
 804     data->flags &= ~SF_BEFORE_EOL;
 805     DEBUG_STUDYDATA("commit: ",data,0);
 806 }
 807
 808 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
 809  * list that describes which code points it matches */
 810
 811 STATIC void
 812 S_ssc_anything(pTHX_ regnode_ssc *ssc)
 813 {
 814     /* Set the SSC 'ssc' to match an empty string or any code point */
 815
 816     PERL_ARGS_ASSERT_SSC_ANYTHING;
 817
 818     assert(OP(ssc) == ANYOF_SYNTHETIC);
 819
 820     ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
 821     _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
 822     ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING;    /* Plus match empty string */
 823 }
 824
 825 STATIC int
 826 S_ssc_is_anything(pTHX_ const regnode_ssc *ssc)
 827 {
 828     /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
 829      * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
 830      * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
 831      * in any way, so there's no point in using it */
 832
 833     UV start, end;
 834     bool ret;
 835
 836     PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
 837
 838     assert(OP(ssc) == ANYOF_SYNTHETIC);
 839
 840     if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) {
 841         return FALSE;
 842     }
 843
 844     /* See if the list consists solely of the range 0 - Infinity */
 845     invlist_iterinit(ssc->invlist);
 846     ret = invlist_iternext(ssc->invlist, &start, &end)
 847           && start == 0
 848           && end == UV_MAX;
 849
 850     invlist_iterfinish(ssc->invlist);
 851
 852     if (ret) {
 853         return TRUE;
 854     }
 855
 856     /* If e.g., both \w and \W are set, matches everything */
 857     if (ANYOF_FLAGS(ssc) & ANYOF_POSIXL) {
 858         int i;
 859         for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
 860             if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
 861                 return TRUE;
 862             }
 863         }
 864     }
 865
 866     return FALSE;
 867 }
 868
 869 STATIC void
 870 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
 871 {
 872     /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
 873      * string, any code point, or any posix class under locale */
 874
 875     PERL_ARGS_ASSERT_SSC_INIT;
 876
 877     Zero(ssc, 1, regnode_ssc);
 878     OP(ssc) = ANYOF_SYNTHETIC;
 879     ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY);
 880     ssc_anything(ssc);
 881
 882     /* If any portion of the regex is to operate under locale rules,
 883      * initialization includes it.  The reason this isn't done for all regexes
 884      * is that the optimizer was written under the assumption that locale was
 885      * all-or-nothing.  Given the complexity and lack of documentation in the
 886      * optimizer, and that there are inadequate test cases for locale, many
 887      * parts of it may not work properly, it is safest to avoid locale unless
 888      * necessary. */
 889     if (RExC_contains_locale) {
 890         ANYOF_POSIXL_SETALL(ssc);
 891         ANYOF_FLAGS(ssc) |= ANYOF_LOCALE|ANYOF_POSIXL;
 892         if (RExC_contains_i) {
 893             ANYOF_FLAGS(ssc) |= ANYOF_LOC_FOLD;
 894         }
 895     }
 896     else {
 897         ANYOF_POSIXL_ZERO(ssc);
 898     }
 899 }
 900
 901 STATIC int
 902 S_ssc_is_cp_posixl_init(pTHX_ const RExC_state_t *pRExC_state,
 903                               const regnode_ssc *ssc)
 904 {
 905     /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
 906      * to the list of code points matched, and locale posix classes; hence does
 907      * not check its flags) */
 908
 909     UV start, end;
 910     bool ret;
 911
 912     PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
 913
 914     assert(OP(ssc) == ANYOF_SYNTHETIC);
 915
 916     invlist_iterinit(ssc->invlist);
 917     ret = invlist_iternext(ssc->invlist, &start, &end)
 918           && start == 0
 919           && end == UV_MAX;
 920
 921     invlist_iterfinish(ssc->invlist);
 922
 923     if (! ret) {
 924         return FALSE;
 925     }
 926
 927     if (RExC_contains_locale) {
 928         if (! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE)
 929             || ! (ANYOF_FLAGS(ssc) & ANYOF_POSIXL)
 930             || ! ANYOF_POSIXL_TEST_ALL_SET(ssc))
 931         {
 932             return FALSE;
 933         }
 934         if (RExC_contains_i && ! (ANYOF_FLAGS(ssc) & ANYOF_LOC_FOLD)) {
 935             return FALSE;
 936         }
 937     }
 938
 939     return TRUE;
 940 }
 941
 942 STATIC SV*
 943 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
 944                                   const regnode_charclass_posixl* const node)
 945 {
 946     /* Returns a mortal inversion list defining which code points are matched
 947      * by 'node', which is of type ANYOF.  Handles complementing the result if
 948      * appropriate.  If some code points aren't knowable at this time, the
 949      * returned list must, and will, contain every possible code point. */
 950
 951     SV* invlist = sv_2mortal(_new_invlist(0));
 952     unsigned int i;
 953     const U32 n = ARG(node);
 954
 955     PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
 956
 957     /* Look at the data structure created by S_set_ANYOF_arg() */
 958     if (n != ANYOF_NONBITMAP_EMPTY) {
 959         SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
 960         AV * const av = MUTABLE_AV(SvRV(rv));
 961         SV **const ary = AvARRAY(av);
 962         assert(RExC_rxi->data->what[n] == 's');
 963
 964         if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
 965             invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
 966         }
 967         else if (ary[0] && ary[0] != &PL_sv_undef) {
 968
 969             /* Here, no compile-time swash, and there are things that won't be
 970              * known until runtime -- we have to assume it could be anything */
 971             return _add_range_to_invlist(invlist, 0, UV_MAX);
 972         }
 973         else {
 974
 975             /* Here no compile-time swash, and no run-time only data.  Use the
 976              * node's inversion list */
 977             invlist = sv_2mortal(invlist_clone(ary[2]));
 978         }
 979     }
 980
 981     /* An ANYOF node contains a bitmap for the first 256 code points, and an
 982      * inversion list for the others, but if there are code points that should
 983      * match only conditionally on the target string being UTF-8, those are
 984      * placed in the inversion list, and not the bitmap.  Since there are
 985      * circumstances under which they could match, they are included in the
 986      * SSC.  But if the ANYOF node is to be inverted, we have to exclude them
 987      * here, so that when we invert below, the end result actually does include
 988      * them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We have to do this here
 989      * before we add the unconditionally matched code points */
 990     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
 991         _invlist_intersection_complement_2nd(invlist,
 992                                              PL_UpperLatin1,
 993                                              &invlist);
 994     }
 995
 996     /* Add in the points from the bit map */
 997     for (i = 0; i < 256; i++) {
 998         if (ANYOF_BITMAP_TEST(node, i)) {
 999             invlist = add_cp_to_invlist(invlist, i);
1000         }
1001     }
1002
1003     /* If this can match all upper Latin1 code points, have to add them
1004      * as well */
1005     if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_LATIN1_ALL) {
1006         _invlist_union(invlist, PL_UpperLatin1, &invlist);
1007     }
1008
1009     /* Similarly for these */
1010     if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
1011         invlist = _add_range_to_invlist(invlist, 256, UV_MAX);
1012     }
1013
1014     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1015         _invlist_invert(invlist);
1016     }
1017
1018     return invlist;
1019 }
1020
1021 /* These two functions currently do the exact same thing */
1022 #define ssc_init_zero           ssc_init
1023
1024 #define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
1025 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1026
1027 STATIC void
1028 S_ssc_flags_and(regnode_ssc *ssc, const U8 and_with)
1029 {
1030     /* Take the flags 'and_with' and accumulate them anded into the flags for
1031      * the SSC 'ssc'.  The non-SSC related flags in 'and_with' are ignored.
1032      * The flags 'and_with' should not come from another SSC (otherwise the
1033      * EMPTY_STRING flag won't work) */
1034
1035     const U8 ssc_only_flags = ANYOF_FLAGS(ssc) & ~ANYOF_LOCALE_FLAGS;
1036
1037     PERL_ARGS_ASSERT_SSC_FLAGS_AND;
1038
1039     /* Use just the SSC-related flags from 'and_with' */
1040     ANYOF_FLAGS(ssc) &= (and_with & ANYOF_LOCALE_FLAGS);
1041     ANYOF_FLAGS(ssc) |= ssc_only_flags;
1042 }
1043
1044 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
1045  * should not be inverted.  'and_with->flags & ANYOF_POSIXL' should be 0 if
1046  * 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1047
1048 STATIC void
1049 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1050                 const regnode_ssc *and_with)
1051 {
1052     /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1053      * another SSC or a regular ANYOF class.  Can create false positives. */
1054
1055     SV* anded_cp_list;
1056     U8  anded_flags;
1057
1058     PERL_ARGS_ASSERT_SSC_AND;
1059
1060     assert(OP(ssc) == ANYOF_SYNTHETIC);
1061
1062     /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1063      * the code point inversion list and just the relevant flags */
1064     if (OP(and_with) == ANYOF_SYNTHETIC) {
1065         anded_cp_list = and_with->invlist;
1066         anded_flags = ANYOF_FLAGS(and_with);
1067     }
1068     else {
1069         anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state,
1070                                         (regnode_charclass_posixl*) and_with);
1071         anded_flags = ANYOF_FLAGS(and_with) & ANYOF_LOCALE_FLAGS;
1072     }
1073
1074     ANYOF_FLAGS(ssc) &= anded_flags;
1075
1076     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1077      * C2 is the list of code points in 'and-with'; P2, its posix classes.
1078      * 'and_with' may be inverted.  When not inverted, we have the situation of
1079      * computing:
1080      *  (C1 | P1) & (C2 | P2)
1081      *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1082      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1083      *                    <=  ((C1 & C2) |       P2)) | ( P1       | (P1 & P2))
1084      *                    <=  ((C1 & C2) | P1 | P2)
1085      * Alternatively, the last few steps could be:
1086      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1087      *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
1088      *                    <=  (C1 | C2 | (P1 & P2))
1089      * We favor the second approach if either P1 or P2 is non-empty.  This is
1090      * because these components are a barrier to doing optimizations, as what
1091      * they match cannot be known until the moment of matching as they are
1092      * dependent on the current locale, 'AND"ing them likely will reduce or
1093      * eliminate them.
1094      * But we can do better if we know that C1,P1 are in their initial state (a
1095      * frequent occurrence), each matching everything:
1096      *  (<everything>) & (C2 | P2) =  C2 | P2
1097      * Similarly, if C2,P2 are in their initial state (again a frequent
1098      * occurrence), the result is a no-op
1099      *  (C1 | P1) & (<everything>) =  C1 | P1
1100      *
1101      * Inverted, we have
1102      *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
1103      *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1104      *                         <=  (C1 & ~C2) | (P1 & ~P2)
1105      * */
1106
1107     if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1108         && OP(and_with) != ANYOF_SYNTHETIC)
1109     {
1110         unsigned int i;
1111
1112         ssc_intersection(ssc,
1113                          anded_cp_list,
1114                          FALSE /* Has already been inverted */
1115                          );
1116
1117         /* If either P1 or P2 is empty, the intersection will be also; can skip
1118          * the loop */
1119         if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) {
1120             ANYOF_POSIXL_ZERO(ssc);
1121         }
1122         else if (ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
1123
1124             /* Note that the Posix class component P from 'and_with' actually
1125              * looks like:
1126              *      P = Pa | Pb | ... | Pn
1127              * where each component is one posix class, such as in [\w\s].
1128              * Thus
1129              *      ~P = ~(Pa | Pb | ... | Pn)
1130              *         = ~Pa & ~Pb & ... & ~Pn
1131              *        <= ~Pa | ~Pb | ... | ~Pn
1132              * The last is something we can easily calculate, but unfortunately
1133              * is likely to have many false positives.  We could do better
1134              * in some (but certainly not all) instances if two classes in
1135              * P have known relationships.  For example
1136              *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1137              * So
1138              *      :lower: & :print: = :lower:
1139              * And similarly for classes that must be disjoint.  For example,
1140              * since \s and \w can have no elements in common based on rules in
1141              * the POSIX standard,
1142              *      \w & ^\S = nothing
1143              * Unfortunately, some vendor locales do not meet the Posix
1144              * standard, in particular almost everything by Microsoft.
1145              * The loop below just changes e.g., \w into \W and vice versa */
1146
1147             regnode_charclass_posixl temp;
1148             int add = 1;    /* To calculate the index of the complement */
1149
1150             ANYOF_POSIXL_ZERO(&temp);
1151             for (i = 0; i < ANYOF_MAX; i++) {
1152                 assert(i % 2 != 0
1153                        || ! ANYOF_POSIXL_TEST(and_with, i)
1154                        || ! ANYOF_POSIXL_TEST(and_with, i + 1));
1155
1156                 if (ANYOF_POSIXL_TEST(and_with, i)) {
1157                     ANYOF_POSIXL_SET(&temp, i + add);
1158                 }
1159                 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1160             }
1161             ANYOF_POSIXL_AND(&temp, ssc);
1162
1163         } /* else ssc already has no posixes */
1164     } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
1165          in its initial state */
1166     else if (OP(and_with) != ANYOF_SYNTHETIC
1167              || ! ssc_is_cp_posixl_init(pRExC_state, and_with))
1168     {
1169         /* But if 'ssc' is in its initial state, the result is just 'and_with';
1170          * copy it over 'ssc' */
1171         if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1172             if (OP(and_with) == ANYOF_SYNTHETIC) {
1173                 StructCopy(and_with, ssc, regnode_ssc);
1174             }
1175             else {
1176                 ssc->invlist = anded_cp_list;
1177                 ANYOF_POSIXL_ZERO(ssc);
1178                 if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1179                     ANYOF_POSIXL_OR(and_with, ssc);
1180                 }
1181             }
1182         }
1183         else if ((ANYOF_FLAGS(ssc) & ANYOF_POSIXL)
1184                     || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL))
1185         {
1186             /* One or the other of P1, P2 is non-empty. */
1187             ANYOF_POSIXL_AND(and_with, ssc);
1188             ssc_union(ssc, anded_cp_list, FALSE);
1189         }
1190         else { /* P1 = P2 = empty */
1191             ssc_intersection(ssc, anded_cp_list, FALSE);
1192         }
1193     }
1194 }
1195
1196 STATIC void
1197 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1198                const regnode_ssc *or_with)
1199 {
1200     /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1201      * another SSC or a regular ANYOF class.  Can create false positives if
1202      * 'or_with' is to be inverted. */
1203
1204     SV* ored_cp_list;
1205     U8 ored_flags;
1206
1207     PERL_ARGS_ASSERT_SSC_OR;
1208
1209     assert(OP(ssc) == ANYOF_SYNTHETIC);
1210
1211     /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1212      * the code point inversion list and just the relevant flags */
1213     if (OP(or_with) == ANYOF_SYNTHETIC) {
1214         ored_cp_list = or_with->invlist;
1215         ored_flags = ANYOF_FLAGS(or_with);
1216     }
1217     else {
1218         ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state,
1219                                         (regnode_charclass_posixl*) or_with);
1220         ored_flags = ANYOF_FLAGS(or_with) & ANYOF_LOCALE_FLAGS;
1221     }
1222
1223     ANYOF_FLAGS(ssc) |= ored_flags;
1224
1225     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1226      * C2 is the list of code points in 'or-with'; P2, its posix classes.
1227      * 'or_with' may be inverted.  When not inverted, we have the simple
1228      * situation of computing:
1229      *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
1230      * If P1|P2 yields a situation with both a class and its complement are
1231      * set, like having both \w and \W, this matches all code points, and we
1232      * can delete these from the P component of the ssc going forward.  XXX We
1233      * might be able to delete all the P components, but I (khw) am not certain
1234      * about this, and it is better to be safe.
1235      *
1236      * Inverted, we have
1237      *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
1238      *                         <=  (C1 | P1) | ~C2
1239      *                         <=  (C1 | ~C2) | P1
1240      * (which results in actually simpler code than the non-inverted case)
1241      * */
1242
1243     if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1244         && OP(or_with) != ANYOF_SYNTHETIC)
1245     {
1246         /* We ignore P2, leaving P1 going forward */
1247     }
1248     else {  /* Not inverted */
1249         ANYOF_POSIXL_OR(or_with, ssc);
1250         if (ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
1251             unsigned int i;
1252             for (i = 0; i < ANYOF_MAX; i += 2) {
1253                 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1254                 {
1255                     ssc_match_all_cp(ssc);
1256                     ANYOF_POSIXL_CLEAR(ssc, i);
1257                     ANYOF_POSIXL_CLEAR(ssc, i+1);
1258                     if (! ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
1259                         ANYOF_FLAGS(ssc) &= ~ANYOF_POSIXL;
1260                     }
1261                 }
1262             }
1263         }
1264     }
1265
1266     ssc_union(ssc,
1267               ored_cp_list,
1268               FALSE /* Already has been inverted */
1269               );
1270 }
1271
1272 PERL_STATIC_INLINE void
1273 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1274 {
1275     PERL_ARGS_ASSERT_SSC_UNION;
1276
1277     assert(OP(ssc) == ANYOF_SYNTHETIC);
1278
1279     _invlist_union_maybe_complement_2nd(ssc->invlist,
1280                                         invlist,
1281                                         invert2nd,
1282                                         &ssc->invlist);
1283 }
1284
1285 PERL_STATIC_INLINE void
1286 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1287                          SV* const invlist,
1288                          const bool invert2nd)
1289 {
1290     PERL_ARGS_ASSERT_SSC_INTERSECTION;
1291
1292     assert(OP(ssc) == ANYOF_SYNTHETIC);
1293
1294     _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1295                                                invlist,
1296                                                invert2nd,
1297                                                &ssc->invlist);
1298 }
1299
1300 PERL_STATIC_INLINE void
1301 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1302 {
1303     PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1304
1305     assert(OP(ssc) == ANYOF_SYNTHETIC);
1306
1307     ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1308 }
1309
1310 PERL_STATIC_INLINE void
1311 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1312 {
1313     /* AND just the single code point 'cp' into the SSC 'ssc' */
1314
1315     SV* cp_list = _new_invlist(2);
1316
1317     PERL_ARGS_ASSERT_SSC_CP_AND;
1318
1319     assert(OP(ssc) == ANYOF_SYNTHETIC);
1320
1321     cp_list = add_cp_to_invlist(cp_list, cp);
1322     ssc_intersection(ssc, cp_list,
1323                      FALSE /* Not inverted */
1324                      );
1325     SvREFCNT_dec_NN(cp_list);
1326 }
1327
1328 PERL_STATIC_INLINE void
1329 S_ssc_clear_locale(pTHX_ regnode_ssc *ssc)
1330 {
1331     /* Set the SSC 'ssc' to not match any locale things */
1332
1333     PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1334
1335     assert(OP(ssc) == ANYOF_SYNTHETIC);
1336
1337     ANYOF_POSIXL_ZERO(ssc);
1338     ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1339 }
1340
1341 STATIC void
1342 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1343 {
1344     /* The inversion list in the SSC is marked mortal; now we need a more
1345      * permanent copy, which is stored the same way that is done in a regular
1346      * ANYOF node, with the first 256 code points in a bit map */
1347
1348     SV* invlist = invlist_clone(ssc->invlist);
1349
1350     PERL_ARGS_ASSERT_SSC_FINALIZE;
1351
1352     assert(OP(ssc) == ANYOF_SYNTHETIC);
1353
1354     /* The code in this file assumes that all but these flags aren't relevant
1355      * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the
1356      * time we reach here */
1357     assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_LOCALE_FLAGS));
1358
1359     populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1360
1361     set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL, FALSE);
1362
1363     assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE) || RExC_contains_locale);
1364 }
1365
1366 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1367 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1368 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1369 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1370
1371
1372 #ifdef DEBUGGING
1373 /*
1374    dump_trie(trie,widecharmap,revcharmap)
1375    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1376    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1377
1378    These routines dump out a trie in a somewhat readable format.
1379    The _interim_ variants are used for debugging the interim
1380    tables that are used to generate the final compressed
1381    representation which is what dump_trie expects.
1382
1383    Part of the reason for their existence is to provide a form
1384    of documentation as to how the different representations function.
1385
1386 */
1387
1388 /*
1389   Dumps the final compressed table form of the trie to Perl_debug_log.
1390   Used for debugging make_trie().
1391 */
1392
1393 STATIC void
1394 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1395             AV *revcharmap, U32 depth)
1396 {
1397     U32 state;
1398     SV *sv=sv_newmortal();
1399     int colwidth= widecharmap ? 6 : 4;
1400     U16 word;
1401     GET_RE_DEBUG_FLAGS_DECL;
1402
1403     PERL_ARGS_ASSERT_DUMP_TRIE;
1404
1405     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1406         (int)depth * 2 + 2,"",
1407         "Match","Base","Ofs" );
1408
1409     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1410         SV ** const tmp = av_fetch( revcharmap, state, 0);
1411         if ( tmp ) {
1412             PerlIO_printf( Perl_debug_log, "%*s",
1413                 colwidth,
1414                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1415                             PL_colors[0], PL_colors[1],
1416                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1417                             PERL_PV_ESCAPE_FIRSTCHAR
1418                 )
1419             );
1420         }
1421     }
1422     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1423         (int)depth * 2 + 2,"");
1424
1425     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1426         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1427     PerlIO_printf( Perl_debug_log, "\n");
1428
1429     for( state = 1 ; state < trie->statecount ; state++ ) {
1430         const U32 base = trie->states[ state ].trans.base;
1431
1432         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1433
1434         if ( trie->states[ state ].wordnum ) {
1435             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1436         } else {
1437             PerlIO_printf( Perl_debug_log, "%6s", "" );
1438         }
1439
1440         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1441
1442         if ( base ) {
1443             U32 ofs = 0;
1444
1445             while( ( base + ofs  < trie->uniquecharcount ) ||
1446                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1447                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1448                     ofs++;
1449
1450             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1451
1452             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1453                 if ( ( base + ofs >= trie->uniquecharcount ) &&
1454                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1455                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1456                 {
1457                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1458                     colwidth,
1459                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1460                 } else {
1461                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1462                 }
1463             }
1464
1465             PerlIO_printf( Perl_debug_log, "]");
1466
1467         }
1468         PerlIO_printf( Perl_debug_log, "\n" );
1469     }
1470     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1471     for (word=1; word <= trie->wordcount; word++) {
1472         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1473             (int)word, (int)(trie->wordinfo[word].prev),
1474             (int)(trie->wordinfo[word].len));
1475     }
1476     PerlIO_printf(Perl_debug_log, "\n" );
1477 }
1478 /*
1479   Dumps a fully constructed but uncompressed trie in list form.
1480   List tries normally only are used for construction when the number of
1481   possible chars (trie->uniquecharcount) is very high.
1482   Used for debugging make_trie().
1483 */
1484 STATIC void
1485 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1486                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1487                          U32 depth)
1488 {
1489     U32 state;
1490     SV *sv=sv_newmortal();
1491     int colwidth= widecharmap ? 6 : 4;
1492     GET_RE_DEBUG_FLAGS_DECL;
1493
1494     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1495
1496     /* print out the table precompression.  */
1497     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1498         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1499         "------:-----+-----------------\n" );
1500
1501     for( state=1 ; state < next_alloc ; state ++ ) {
1502         U16 charid;
1503
1504         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1505             (int)depth * 2 + 2,"", (UV)state  );
1506         if ( ! trie->states[ state ].wordnum ) {
1507             PerlIO_printf( Perl_debug_log, "%5s| ","");
1508         } else {
1509             PerlIO_printf( Perl_debug_log, "W%4x| ",
1510                 trie->states[ state ].wordnum
1511             );
1512         }
1513         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1514             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1515             if ( tmp ) {
1516                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1517                     colwidth,
1518                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1519                             PL_colors[0], PL_colors[1],
1520                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1521                             PERL_PV_ESCAPE_FIRSTCHAR
1522                     ) ,
1523                     TRIE_LIST_ITEM(state,charid).forid,
1524                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1525                 );
1526                 if (!(charid % 10))
1527                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1528                         (int)((depth * 2) + 14), "");
1529             }
1530         }
1531         PerlIO_printf( Perl_debug_log, "\n");
1532     }
1533 }
1534
1535 /*
1536   Dumps a fully constructed but uncompressed trie in table form.
1537   This is the normal DFA style state transition table, with a few
1538   twists to facilitate compression later.
1539   Used for debugging make_trie().
1540 */
1541 STATIC void
1542 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1543                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1544                           U32 depth)
1545 {
1546     U32 state;
1547     U16 charid;
1548     SV *sv=sv_newmortal();
1549     int colwidth= widecharmap ? 6 : 4;
1550     GET_RE_DEBUG_FLAGS_DECL;
1551
1552     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1553
1554     /*
1555        print out the table precompression so that we can do a visual check
1556        that they are identical.
1557      */
1558
1559     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1560
1561     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1562         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1563         if ( tmp ) {
1564             PerlIO_printf( Perl_debug_log, "%*s",
1565                 colwidth,
1566                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1567                             PL_colors[0], PL_colors[1],
1568                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1569                             PERL_PV_ESCAPE_FIRSTCHAR
1570                 )
1571             );
1572         }
1573     }
1574
1575     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1576
1577     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1578         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1579     }
1580
1581     PerlIO_printf( Perl_debug_log, "\n" );
1582
1583     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1584
1585         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1586             (int)depth * 2 + 2,"",
1587             (UV)TRIE_NODENUM( state ) );
1588
1589         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1590             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1591             if (v)
1592                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1593             else
1594                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1595         }
1596         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1597             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1598         } else {
1599             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1600             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1601         }
1602     }
1603 }
1604
1605 #endif
1606
1607
1608 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1609   startbranch: the first branch in the whole branch sequence
1610   first      : start branch of sequence of branch-exact nodes.
1611                May be the same as startbranch
1612   last       : Thing following the last branch.
1613                May be the same as tail.
1614   tail       : item following the branch sequence
1615   count      : words in the sequence
1616   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1617   depth      : indent depth
1618
1619 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1620
1621 A trie is an N'ary tree where the branches are determined by digital
1622 decomposition of the key. IE, at the root node you look up the 1st character and
1623 follow that branch repeat until you find the end of the branches. Nodes can be
1624 marked as "accepting" meaning they represent a complete word. Eg:
1625
1626   /he|she|his|hers/
1627
1628 would convert into the following structure. Numbers represent states, letters
1629 following numbers represent valid transitions on the letter from that state, if
1630 the number is in square brackets it represents an accepting state, otherwise it
1631 will be in parenthesis.
1632
1633       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1634       |    |
1635       |   (2)
1636       |    |
1637      (1)   +-i->(6)-+-s->[7]
1638       |
1639       +-s->(3)-+-h->(4)-+-e->[5]
1640
1641       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1642
1643 This shows that when matching against the string 'hers' we will begin at state 1
1644 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1645 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1646 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1647 single traverse. We store a mapping from accepting to state to which word was
1648 matched, and then when we have multiple possibilities we try to complete the
1649 rest of the regex in the order in which they occured in the alternation.
1650
1651 The only prior NFA like behaviour that would be changed by the TRIE support is
1652 the silent ignoring of duplicate alternations which are of the form:
1653
1654  / (DUPE|DUPE) X? (?{ ... }) Y /x
1655
1656 Thus EVAL blocks following a trie may be called a different number of times with
1657 and without the optimisation. With the optimisations dupes will be silently
1658 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1659 the following demonstrates:
1660
1661  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1662
1663 which prints out 'word' three times, but
1664
1665  'words'=~/(word|word|word)(?{ print $1 })S/
1666
1667 which doesnt print it out at all. This is due to other optimisations kicking in.
1668
1669 Example of what happens on a structural level:
1670
1671 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1672
1673    1: CURLYM[1] {1,32767}(18)
1674    5:   BRANCH(8)
1675    6:     EXACT <ac>(16)
1676    8:   BRANCH(11)
1677    9:     EXACT <ad>(16)
1678   11:   BRANCH(14)
1679   12:     EXACT <ab>(16)
1680   16:   SUCCEED(0)
1681   17:   NOTHING(18)
1682   18: END(0)
1683
1684 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1685 and should turn into:
1686
1687    1: CURLYM[1] {1,32767}(18)
1688    5:   TRIE(16)
1689         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1690           <ac>
1691           <ad>
1692           <ab>
1693   16:   SUCCEED(0)
1694   17:   NOTHING(18)
1695   18: END(0)
1696
1697 Cases where tail != last would be like /(?foo|bar)baz/:
1698
1699    1: BRANCH(4)
1700    2:   EXACT <foo>(8)
1701    4: BRANCH(7)
1702    5:   EXACT <bar>(8)
1703    7: TAIL(8)
1704    8: EXACT <baz>(10)
1705   10: END(0)
1706
1707 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1708 and would end up looking like:
1709
1710     1: TRIE(8)
1711       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1712         <foo>
1713         <bar>
1714    7: TAIL(8)
1715    8: EXACT <baz>(10)
1716   10: END(0)
1717
1718     d = uvchr_to_utf8_flags(d, uv, 0);
1719
1720 is the recommended Unicode-aware way of saying
1721
1722     *(d++) = uv;
1723 */
1724
1725 #define TRIE_STORE_REVCHAR(val)                                            \
1726     STMT_START {                                                           \
1727         if (UTF) {                                                         \
1728             SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1729             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1730             unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1731             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1732             SvPOK_on(zlopp);                                               \
1733             SvUTF8_on(zlopp);                                              \
1734             av_push(revcharmap, zlopp);                                    \
1735         } else {                                                           \
1736             char ooooff = (char)val;                                           \
1737             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1738         }                                                                  \
1739         } STMT_END
1740
1741 /* This gets the next character from the input, folding it if not already
1742  * folded. */
1743 #define TRIE_READ_CHAR STMT_START {                                           \
1744     wordlen++;                                                                \
1745     if ( UTF ) {                                                              \
1746         /* if it is UTF then it is either already folded, or does not need    \
1747          * folding */                                                         \
1748         uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
1749     }                                                                         \
1750     else if (folder == PL_fold_latin1) {                                      \
1751         /* This folder implies Unicode rules, which in the range expressible  \
1752          *  by not UTF is the lower case, with the two exceptions, one of     \
1753          *  which should have been taken care of before calling this */       \
1754         assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
1755         uvc = toLOWER_L1(*uc);                                                \
1756         if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
1757         len = 1;                                                              \
1758     } else {                                                                  \
1759         /* raw data, will be folded later if needed */                        \
1760         uvc = (U32)*uc;                                                       \
1761         len = 1;                                                              \
1762     }                                                                         \
1763 } STMT_END
1764
1765
1766
1767 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1768     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1769         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1770         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1771     }                                                           \
1772     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1773     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1774     TRIE_LIST_CUR( state )++;                                   \
1775 } STMT_END
1776
1777 #define TRIE_LIST_NEW(state) STMT_START {                       \
1778     Newxz( trie->states[ state ].trans.list,               \
1779         4, reg_trie_trans_le );                                 \
1780      TRIE_LIST_CUR( state ) = 1;                                \
1781      TRIE_LIST_LEN( state ) = 4;                                \
1782 } STMT_END
1783
1784 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1785     U16 dupe= trie->states[ state ].wordnum;                    \
1786     regnode * const noper_next = regnext( noper );              \
1787                                                                 \
1788     DEBUG_r({                                                   \
1789         /* store the word for dumping */                        \
1790         SV* tmp;                                                \
1791         if (OP(noper) != NOTHING)                               \
1792             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1793         else                                                    \
1794             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1795         av_push( trie_words, tmp );                             \
1796     });                                                         \
1797                                                                 \
1798     curword++;                                                  \
1799     trie->wordinfo[curword].prev   = 0;                         \
1800     trie->wordinfo[curword].len    = wordlen;                   \
1801     trie->wordinfo[curword].accept = state;                     \
1802                                                                 \
1803     if ( noper_next < tail ) {                                  \
1804         if (!trie->jump)                                        \
1805             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1806         trie->jump[curword] = (U16)(noper_next - convert);      \
1807         if (!jumper)                                            \
1808             jumper = noper_next;                                \
1809         if (!nextbranch)                                        \
1810             nextbranch= regnext(cur);                           \
1811     }                                                           \
1812                                                                 \
1813     if ( dupe ) {                                               \
1814         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1815         /* chain, so that when the bits of chain are later    */\
1816         /* linked together, the dups appear in the chain      */\
1817         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1818         trie->wordinfo[dupe].prev = curword;                    \
1819     } else {                                                    \
1820         /* we haven't inserted this word yet.                */ \
1821         trie->states[ state ].wordnum = curword;                \
1822     }                                                           \
1823 } STMT_END
1824
1825
1826 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1827      ( ( base + charid >=  ucharcount                                   \
1828          && base + charid < ubound                                      \
1829          && state == trie->trans[ base - ucharcount + charid ].check    \
1830          && trie->trans[ base - ucharcount + charid ].next )            \
1831            ? trie->trans[ base - ucharcount + charid ].next             \
1832            : ( state==1 ? special : 0 )                                 \
1833       )
1834
1835 #define MADE_TRIE       1
1836 #define MADE_JUMP_TRIE  2
1837 #define MADE_EXACT_TRIE 4
1838
1839 STATIC I32
1840 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1841 {
1842     dVAR;
1843     /* first pass, loop through and scan words */
1844     reg_trie_data *trie;
1845     HV *widecharmap = NULL;
1846     AV *revcharmap = newAV();
1847     regnode *cur;
1848     STRLEN len = 0;
1849     UV uvc = 0;
1850     U16 curword = 0;
1851     U32 next_alloc = 0;
1852     regnode *jumper = NULL;
1853     regnode *nextbranch = NULL;
1854     regnode *convert = NULL;
1855     U32 *prev_states; /* temp array mapping each state to previous one */
1856     /* we just use folder as a flag in utf8 */
1857     const U8 * folder = NULL;
1858
1859 #ifdef DEBUGGING
1860     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
1861     AV *trie_words = NULL;
1862     /* along with revcharmap, this only used during construction but both are
1863      * useful during debugging so we store them in the struct when debugging.
1864      */
1865 #else
1866     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
1867     STRLEN trie_charcount=0;
1868 #endif
1869     SV *re_trie_maxbuff;
1870     GET_RE_DEBUG_FLAGS_DECL;
1871
1872     PERL_ARGS_ASSERT_MAKE_TRIE;
1873 #ifndef DEBUGGING
1874     PERL_UNUSED_ARG(depth);
1875 #endif
1876
1877     switch (flags) {
1878         case EXACT: break;
1879         case EXACTFA:
1880         case EXACTFU_SS:
1881         case EXACTFU: folder = PL_fold_latin1; break;
1882         case EXACTF:  folder = PL_fold; break;
1883         case EXACTFL: folder = PL_fold_locale; break;
1884         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1885     }
1886
1887     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1888     trie->refcount = 1;
1889     trie->startstate = 1;
1890     trie->wordcount = word_count;
1891     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1892     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1893     if (flags == EXACT)
1894         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1895     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1896                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1897
1898     DEBUG_r({
1899         trie_words = newAV();
1900     });
1901
1902     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1903     if (!SvIOK(re_trie_maxbuff)) {
1904         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1905     }
1906     DEBUG_TRIE_COMPILE_r({
1907                 PerlIO_printf( Perl_debug_log,
1908                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1909                   (int)depth * 2 + 2, "",
1910                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1911                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1912                   (int)depth);
1913     });
1914
1915    /* Find the node we are going to overwrite */
1916     if ( first == startbranch && OP( last ) != BRANCH ) {
1917         /* whole branch chain */
1918         convert = first;
1919     } else {
1920         /* branch sub-chain */
1921         convert = NEXTOPER( first );
1922     }
1923
1924     /*  -- First loop and Setup --
1925
1926        We first traverse the branches and scan each word to determine if it
1927        contains widechars, and how many unique chars there are, this is
1928        important as we have to build a table with at least as many columns as we
1929        have unique chars.
1930
1931        We use an array of integers to represent the character codes 0..255
1932        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1933        native representation of the character value as the key and IV's for the
1934        coded index.
1935
1936        *TODO* If we keep track of how many times each character is used we can
1937        remap the columns so that the table compression later on is more
1938        efficient in terms of memory by ensuring the most common value is in the
1939        middle and the least common are on the outside.  IMO this would be better
1940        than a most to least common mapping as theres a decent chance the most
1941        common letter will share a node with the least common, meaning the node
1942        will not be compressible. With a middle is most common approach the worst
1943        case is when we have the least common nodes twice.
1944
1945      */
1946
1947     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1948         regnode *noper = NEXTOPER( cur );
1949         const U8 *uc = (U8*)STRING( noper );
1950         const U8 *e  = uc + STR_LEN( noper );
1951         STRLEN foldlen = 0;
1952         U32 wordlen      = 0;         /* required init */
1953         STRLEN minbytes = 0;
1954         STRLEN maxbytes = 0;
1955         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1956
1957         if (OP(noper) == NOTHING) {
1958             regnode *noper_next= regnext(noper);
1959             if (noper_next != tail && OP(noper_next) == flags) {
1960                 noper = noper_next;
1961                 uc= (U8*)STRING(noper);
1962                 e= uc + STR_LEN(noper);
1963                 trie->minlen= STR_LEN(noper);
1964             } else {
1965                 trie->minlen= 0;
1966                 continue;
1967             }
1968         }
1969
1970         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1971             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1972                                           regardless of encoding */
1973             if (OP( noper ) == EXACTFU_SS) {
1974                 /* false positives are ok, so just set this */
1975                 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
1976             }
1977         }
1978         for ( ; uc < e ; uc += len ) {
1979             TRIE_CHARCOUNT(trie)++;
1980             TRIE_READ_CHAR;
1981
1982             /* Acummulate to the current values, the range in the number of
1983              * bytes that this character could match.  The max is presumed to
1984              * be the same as the folded input (which TRIE_READ_CHAR returns),
1985              * except that when this is not in UTF-8, it could be matched
1986              * against a string which is UTF-8, and the variant characters
1987              * could be 2 bytes instead of the 1 here.  Likewise, for the
1988              * minimum number of bytes when not folded.  When folding, the min
1989              * is assumed to be 1 byte could fold to match the single character
1990              * here, or in the case of a multi-char fold, 1 byte can fold to
1991              * the whole sequence.  'foldlen' is used to denote whether we are
1992              * in such a sequence, skipping the min setting if so.  XXX TODO
1993              * Use the exact list of what folds to each character, from
1994              * PL_utf8_foldclosures */
1995             if (UTF) {
1996                 maxbytes += UTF8SKIP(uc);
1997                 if (! folder) {
1998                     /* A non-UTF-8 string could be 1 byte to match our 2 */
1999                     minbytes += (UTF8_IS_DOWNGRADEABLE_START(*uc))
2000                                 ? 1
2001                                 : UTF8SKIP(uc);
2002                 }
2003                 else {
2004                     if (foldlen) {
2005                         foldlen -= UTF8SKIP(uc);
2006                     }
2007                     else {
2008                         foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e);
2009                         minbytes++;
2010                     }
2011                 }
2012             }
2013             else {
2014                 maxbytes += (UNI_IS_INVARIANT(*uc))
2015                              ? 1
2016                              : 2;
2017                 if (! folder) {
2018                     minbytes++;
2019                 }
2020                 else {
2021                     if (foldlen) {
2022                         foldlen--;
2023                     }
2024                     else {
2025                         foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e);
2026                         minbytes++;
2027                     }
2028                 }
2029             }
2030             if ( uvc < 256 ) {
2031                 if ( folder ) {
2032                     U8 folded= folder[ (U8) uvc ];
2033                     if ( !trie->charmap[ folded ] ) {
2034                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
2035                         TRIE_STORE_REVCHAR( folded );
2036                     }
2037                 }
2038                 if ( !trie->charmap[ uvc ] ) {
2039                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2040                     TRIE_STORE_REVCHAR( uvc );
2041                 }
2042                 if ( set_bit ) {
2043                     /* store the codepoint in the bitmap, and its folded
2044                      * equivalent. */
2045                     TRIE_BITMAP_SET(trie, uvc);
2046
2047                     /* store the folded codepoint */
2048                     if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2049
2050                     if ( !UTF ) {
2051                         /* store first byte of utf8 representation of
2052                            variant codepoints */
2053                         if (! UVCHR_IS_INVARIANT(uvc)) {
2054                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2055                         }
2056                     }
2057                     set_bit = 0; /* We've done our bit :-) */
2058                 }
2059             } else {
2060                 SV** svpp;
2061                 if ( !widecharmap )
2062                     widecharmap = newHV();
2063
2064                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2065
2066                 if ( !svpp )
2067                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2068
2069                 if ( !SvTRUE( *svpp ) ) {
2070                     sv_setiv( *svpp, ++trie->uniquecharcount );
2071                     TRIE_STORE_REVCHAR(uvc);
2072                 }
2073             }
2074         }
2075         if( cur == first ) {
2076             trie->minlen = minbytes;
2077             trie->maxlen = maxbytes;
2078         } else if (minbytes < trie->minlen) {
2079             trie->minlen = minbytes;
2080         } else if (maxbytes > trie->maxlen) {
2081             trie->maxlen = maxbytes;
2082         }
2083     } /* end first pass */
2084     DEBUG_TRIE_COMPILE_r(
2085         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2086                 (int)depth * 2 + 2,"",
2087                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2088                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2089                 (int)trie->minlen, (int)trie->maxlen )
2090     );
2091
2092     /*
2093         We now know what we are dealing with in terms of unique chars and
2094         string sizes so we can calculate how much memory a naive
2095         representation using a flat table  will take. If it's over a reasonable
2096         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2097         conservative but potentially much slower representation using an array
2098         of lists.
2099
2100         At the end we convert both representations into the same compressed
2101         form that will be used in regexec.c for matching with. The latter
2102         is a form that cannot be used to construct with but has memory
2103         properties similar to the list form and access properties similar
2104         to the table form making it both suitable for fast searches and
2105         small enough that its feasable to store for the duration of a program.
2106
2107         See the comment in the code where the compressed table is produced
2108         inplace from the flat tabe representation for an explanation of how
2109         the compression works.
2110
2111     */
2112
2113
2114     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2115     prev_states[1] = 0;
2116
2117     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
2118         /*
2119             Second Pass -- Array Of Lists Representation
2120
2121             Each state will be represented by a list of charid:state records
2122             (reg_trie_trans_le) the first such element holds the CUR and LEN
2123             points of the allocated array. (See defines above).
2124
2125             We build the initial structure using the lists, and then convert
2126             it into the compressed table form which allows faster lookups
2127             (but cant be modified once converted).
2128         */
2129
2130         STRLEN transcount = 1;
2131
2132         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2133             "%*sCompiling trie using list compiler\n",
2134             (int)depth * 2 + 2, ""));
2135
2136         trie->states = (reg_trie_state *)
2137             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2138                                   sizeof(reg_trie_state) );
2139         TRIE_LIST_NEW(1);
2140         next_alloc = 2;
2141
2142         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2143
2144             regnode *noper   = NEXTOPER( cur );
2145             U8 *uc           = (U8*)STRING( noper );
2146             const U8 *e      = uc + STR_LEN( noper );
2147             U32 state        = 1;         /* required init */
2148             U16 charid       = 0;         /* sanity init */
2149             U32 wordlen      = 0;         /* required init */
2150
2151             if (OP(noper) == NOTHING) {
2152                 regnode *noper_next= regnext(noper);
2153                 if (noper_next != tail && OP(noper_next) == flags) {
2154                     noper = noper_next;
2155                     uc= (U8*)STRING(noper);
2156                     e= uc + STR_LEN(noper);
2157                 }
2158             }
2159
2160             if (OP(noper) != NOTHING) {
2161                 for ( ; uc < e ; uc += len ) {
2162
2163                     TRIE_READ_CHAR;
2164
2165                     if ( uvc < 256 ) {
2166                         charid = trie->charmap[ uvc ];
2167                     } else {
2168                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2169                         if ( !svpp ) {
2170                             charid = 0;
2171                         } else {
2172                             charid=(U16)SvIV( *svpp );
2173                         }
2174                     }
2175                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
2176                     if ( charid ) {
2177
2178                         U16 check;
2179                         U32 newstate = 0;
2180
2181                         charid--;
2182                         if ( !trie->states[ state ].trans.list ) {
2183                             TRIE_LIST_NEW( state );
2184                         }
2185                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
2186                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
2187                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2188                                 break;
2189                             }
2190                         }
2191                         if ( ! newstate ) {
2192                             newstate = next_alloc++;
2193                             prev_states[newstate] = state;
2194                             TRIE_LIST_PUSH( state, charid, newstate );
2195                             transcount++;
2196                         }
2197                         state = newstate;
2198                     } else {
2199                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2200                     }
2201                 }
2202             }
2203             TRIE_HANDLE_WORD(state);
2204
2205         } /* end second pass */
2206
2207         /* next alloc is the NEXT state to be allocated */
2208         trie->statecount = next_alloc;
2209         trie->states = (reg_trie_state *)
2210             PerlMemShared_realloc( trie->states,
2211                                    next_alloc
2212                                    * sizeof(reg_trie_state) );
2213
2214         /* and now dump it out before we compress it */
2215         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2216                                                          revcharmap, next_alloc,
2217                                                          depth+1)
2218         );
2219
2220         trie->trans = (reg_trie_trans *)
2221             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2222         {
2223             U32 state;
2224             U32 tp = 0;
2225             U32 zp = 0;
2226
2227
2228             for( state=1 ; state < next_alloc ; state ++ ) {
2229                 U32 base=0;
2230
2231                 /*
2232                 DEBUG_TRIE_COMPILE_MORE_r(
2233                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2234                 );
2235                 */
2236
2237                 if (trie->states[state].trans.list) {
2238                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2239                     U16 maxid=minid;
2240                     U16 idx;
2241
2242                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2243                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2244                         if ( forid < minid ) {
2245                             minid=forid;
2246                         } else if ( forid > maxid ) {
2247                             maxid=forid;
2248                         }
2249                     }
2250                     if ( transcount < tp + maxid - minid + 1) {
2251                         transcount *= 2;
2252                         trie->trans = (reg_trie_trans *)
2253                             PerlMemShared_realloc( trie->trans,
2254                                                      transcount
2255                                                      * sizeof(reg_trie_trans) );
2256                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
2257                     }
2258                     base = trie->uniquecharcount + tp - minid;
2259                     if ( maxid == minid ) {
2260                         U32 set = 0;
2261                         for ( ; zp < tp ; zp++ ) {
2262                             if ( ! trie->trans[ zp ].next ) {
2263                                 base = trie->uniquecharcount + zp - minid;
2264                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
2265                                 trie->trans[ zp ].check = state;
2266                                 set = 1;
2267                                 break;
2268                             }
2269                         }
2270                         if ( !set ) {
2271                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
2272                             trie->trans[ tp ].check = state;
2273                             tp++;
2274                             zp = tp;
2275                         }
2276                     } else {
2277                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2278                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
2279                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
2280                             trie->trans[ tid ].check = state;
2281                         }
2282                         tp += ( maxid - minid + 1 );
2283                     }
2284                     Safefree(trie->states[ state ].trans.list);
2285                 }
2286                 /*
2287                 DEBUG_TRIE_COMPILE_MORE_r(
2288                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2289                 );
2290                 */
2291                 trie->states[ state ].trans.base=base;
2292             }
2293             trie->lasttrans = tp + 1;
2294         }
2295     } else {
2296         /*
2297            Second Pass -- Flat Table Representation.
2298
2299            we dont use the 0 slot of either trans[] or states[] so we add 1 to
2300            each.  We know that we will need Charcount+1 trans at most to store
2301            the data (one row per char at worst case) So we preallocate both
2302            structures assuming worst case.
2303
2304            We then construct the trie using only the .next slots of the entry
2305            structs.
2306
2307            We use the .check field of the first entry of the node temporarily
2308            to make compression both faster and easier by keeping track of how
2309            many non zero fields are in the node.
2310
2311            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2312            transition.
2313
2314            There are two terms at use here: state as a TRIE_NODEIDX() which is
2315            a number representing the first entry of the node, and state as a
2316            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2317            and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2318            if there are 2 entrys per node. eg:
2319
2320              A B       A B
2321           1. 2 4    1. 3 7
2322           2. 0 3    3. 0 5
2323           3. 0 0    5. 0 0
2324           4. 0 0    7. 0 0
2325
2326            The table is internally in the right hand, idx form. However as we
2327            also have to deal with the states array which is indexed by nodenum
2328            we have to use TRIE_NODENUM() to convert.
2329
2330         */
2331         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2332             "%*sCompiling trie using table compiler\n",
2333             (int)depth * 2 + 2, ""));
2334
2335         trie->trans = (reg_trie_trans *)
2336             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2337                                   * trie->uniquecharcount + 1,
2338                                   sizeof(reg_trie_trans) );
2339         trie->states = (reg_trie_state *)
2340             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2341                                   sizeof(reg_trie_state) );
2342         next_alloc = trie->uniquecharcount + 1;
2343
2344
2345         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2346
2347             regnode *noper   = NEXTOPER( cur );
2348             const U8 *uc     = (U8*)STRING( noper );
2349             const U8 *e      = uc + STR_LEN( noper );
2350
2351             U32 state        = 1;         /* required init */
2352
2353             U16 charid       = 0;         /* sanity init */
2354             U32 accept_state = 0;         /* sanity init */
2355
2356             U32 wordlen      = 0;         /* required init */
2357
2358             if (OP(noper) == NOTHING) {
2359                 regnode *noper_next= regnext(noper);
2360                 if (noper_next != tail && OP(noper_next) == flags) {
2361                     noper = noper_next;
2362                     uc= (U8*)STRING(noper);
2363                     e= uc + STR_LEN(noper);
2364                 }
2365             }
2366
2367             if ( OP(noper) != NOTHING ) {
2368                 for ( ; uc < e ; uc += len ) {
2369
2370                     TRIE_READ_CHAR;
2371
2372                     if ( uvc < 256 ) {
2373                         charid = trie->charmap[ uvc ];
2374                     } else {
2375                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2376                         charid = svpp ? (U16)SvIV(*svpp) : 0;
2377                     }
2378                     if ( charid ) {
2379                         charid--;
2380                         if ( !trie->trans[ state + charid ].next ) {
2381                             trie->trans[ state + charid ].next = next_alloc;
2382                             trie->trans[ state ].check++;
2383                             prev_states[TRIE_NODENUM(next_alloc)]
2384                                     = TRIE_NODENUM(state);
2385                             next_alloc += trie->uniquecharcount;
2386                         }
2387                         state = trie->trans[ state + charid ].next;
2388                     } else {
2389                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2390                     }
2391                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
2392                 }
2393             }
2394             accept_state = TRIE_NODENUM( state );
2395             TRIE_HANDLE_WORD(accept_state);
2396
2397         } /* end second pass */
2398
2399         /* and now dump it out before we compress it */
2400         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2401                                                           revcharmap,
2402                                                           next_alloc, depth+1));
2403
2404         {
2405         /*
2406            * Inplace compress the table.*
2407
2408            For sparse data sets the table constructed by the trie algorithm will
2409            be mostly 0/FAIL transitions or to put it another way mostly empty.
2410            (Note that leaf nodes will not contain any transitions.)
2411
2412            This algorithm compresses the tables by eliminating most such
2413            transitions, at the cost of a modest bit of extra work during lookup:
2414
2415            - Each states[] entry contains a .base field which indicates the
2416            index in the state[] array wheres its transition data is stored.
2417
2418            - If .base is 0 there are no valid transitions from that node.
2419
2420            - If .base is nonzero then charid is added to it to find an entry in
2421            the trans array.
2422
2423            -If trans[states[state].base+charid].check!=state then the
2424            transition is taken to be a 0/Fail transition. Thus if there are fail
2425            transitions at the front of the node then the .base offset will point
2426            somewhere inside the previous nodes data (or maybe even into a node
2427            even earlier), but the .check field determines if the transition is
2428            valid.
2429
2430            XXX - wrong maybe?
2431            The following process inplace converts the table to the compressed
2432            table: We first do not compress the root node 1,and mark all its
2433            .check pointers as 1 and set its .base pointer as 1 as well. This
2434            allows us to do a DFA construction from the compressed table later,
2435            and ensures that any .base pointers we calculate later are greater
2436            than 0.
2437
2438            - We set 'pos' to indicate the first entry of the second node.
2439
2440            - We then iterate over the columns of the node, finding the first and
2441            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2442            and set the .check pointers accordingly, and advance pos
2443            appropriately and repreat for the next node. Note that when we copy
2444            the next pointers we have to convert them from the original
2445            NODEIDX form to NODENUM form as the former is not valid post
2446            compression.
2447
2448            - If a node has no transitions used we mark its base as 0 and do not
2449            advance the pos pointer.
2450
2451            - If a node only has one transition we use a second pointer into the
2452            structure to fill in allocated fail transitions from other states.
2453            This pointer is independent of the main pointer and scans forward
2454            looking for null transitions that are allocated to a state. When it
2455            finds one it writes the single transition into the "hole".  If the
2456            pointer doesnt find one the single transition is appended as normal.
2457
2458            - Once compressed we can Renew/realloc the structures to release the
2459            excess space.
2460
2461            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2462            specifically Fig 3.47 and the associated pseudocode.
2463
2464            demq
2465         */
2466         const U32 laststate = TRIE_NODENUM( next_alloc );
2467         U32 state, charid;
2468         U32 pos = 0, zp=0;
2469         trie->statecount = laststate;
2470
2471         for ( state = 1 ; state < laststate ; state++ ) {
2472             U8 flag = 0;
2473             const U32 stateidx = TRIE_NODEIDX( state );
2474             const U32 o_used = trie->trans[ stateidx ].check;
2475             U32 used = trie->trans[ stateidx ].check;
2476             trie->trans[ stateidx ].check = 0;
2477
2478             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2479                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2480                     if ( trie->trans[ stateidx + charid ].next ) {
2481                         if (o_used == 1) {
2482                             for ( ; zp < pos ; zp++ ) {
2483                                 if ( ! trie->trans[ zp ].next ) {
2484                                     break;
2485                                 }
2486                             }
2487                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2488                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2489                             trie->trans[ zp ].check = state;
2490                             if ( ++zp > pos ) pos = zp;
2491                             break;
2492                         }
2493                         used--;
2494                     }
2495                     if ( !flag ) {
2496                         flag = 1;
2497                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2498                     }
2499                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2500                     trie->trans[ pos ].check = state;
2501                     pos++;
2502                 }
2503             }
2504         }
2505         trie->lasttrans = pos + 1;
2506         trie->states = (reg_trie_state *)
2507             PerlMemShared_realloc( trie->states, laststate
2508                                    * sizeof(reg_trie_state) );
2509         DEBUG_TRIE_COMPILE_MORE_r(
2510                 PerlIO_printf( Perl_debug_log,
2511                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2512                     (int)depth * 2 + 2,"",
2513                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2514                     (IV)next_alloc,
2515                     (IV)pos,
2516                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2517             );
2518
2519         } /* end table compress */
2520     }
2521     DEBUG_TRIE_COMPILE_MORE_r(
2522             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2523                 (int)depth * 2 + 2, "",
2524                 (UV)trie->statecount,
2525                 (UV)trie->lasttrans)
2526     );
2527     /* resize the trans array to remove unused space */
2528     trie->trans = (reg_trie_trans *)
2529         PerlMemShared_realloc( trie->trans, trie->lasttrans
2530                                * sizeof(reg_trie_trans) );
2531
2532     {   /* Modify the program and insert the new TRIE node */
2533         U8 nodetype =(U8)(flags & 0xFF);
2534         char *str=NULL;
2535
2536 #ifdef DEBUGGING
2537         regnode *optimize = NULL;
2538 #ifdef RE_TRACK_PATTERN_OFFSETS
2539
2540         U32 mjd_offset = 0;
2541         U32 mjd_nodelen = 0;
2542 #endif /* RE_TRACK_PATTERN_OFFSETS */
2543 #endif /* DEBUGGING */
2544         /*
2545            This means we convert either the first branch or the first Exact,
2546            depending on whether the thing following (in 'last') is a branch
2547            or not and whther first is the startbranch (ie is it a sub part of
2548            the alternation or is it the whole thing.)
2549            Assuming its a sub part we convert the EXACT otherwise we convert
2550            the whole branch sequence, including the first.
2551          */
2552         /* Find the node we are going to overwrite */
2553         if ( first != startbranch || OP( last ) == BRANCH ) {
2554             /* branch sub-chain */
2555             NEXT_OFF( first ) = (U16)(last - first);
2556 #ifdef RE_TRACK_PATTERN_OFFSETS
2557             DEBUG_r({
2558                 mjd_offset= Node_Offset((convert));
2559                 mjd_nodelen= Node_Length((convert));
2560             });
2561 #endif
2562             /* whole branch chain */
2563         }
2564 #ifdef RE_TRACK_PATTERN_OFFSETS
2565         else {
2566             DEBUG_r({
2567                 const  regnode *nop = NEXTOPER( convert );
2568                 mjd_offset= Node_Offset((nop));
2569                 mjd_nodelen= Node_Length((nop));
2570             });
2571         }
2572         DEBUG_OPTIMISE_r(
2573             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2574                 (int)depth * 2 + 2, "",
2575                 (UV)mjd_offset, (UV)mjd_nodelen)
2576         );
2577 #endif
2578         /* But first we check to see if there is a common prefix we can
2579            split out as an EXACT and put in front of the TRIE node.  */
2580         trie->startstate= 1;
2581         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2582             U32 state;
2583             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2584                 U32 ofs = 0;
2585                 I32 idx = -1;
2586                 U32 count = 0;
2587                 const U32 base = trie->states[ state ].trans.base;
2588
2589                 if ( trie->states[state].wordnum )
2590                         count = 1;
2591
2592                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2593                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2594                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2595                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2596                     {
2597                         if ( ++count > 1 ) {
2598                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2599                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2600                             if ( state == 1 ) break;
2601                             if ( count == 2 ) {
2602                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2603                                 DEBUG_OPTIMISE_r(
2604                                     PerlIO_printf(Perl_debug_log,
2605                                         "%*sNew Start State=%"UVuf" Class: [",
2606                                         (int)depth * 2 + 2, "",
2607                                         (UV)state));
2608                                 if (idx >= 0) {
2609                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2610                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2611
2612                                     TRIE_BITMAP_SET(trie,*ch);
2613                                     if ( folder )
2614                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2615                                     DEBUG_OPTIMISE_r(
2616                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2617                                     );
2618                                 }
2619                             }
2620                             TRIE_BITMAP_SET(trie,*ch);
2621                             if ( folder )
2622                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2623                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2624                         }
2625                         idx = ofs;
2626                     }
2627                 }
2628                 if ( count == 1 ) {
2629                     SV **tmp = av_fetch( revcharmap, idx, 0);
2630                     STRLEN len;
2631                     char *ch = SvPV( *tmp, len );
2632                     DEBUG_OPTIMISE_r({
2633                         SV *sv=sv_newmortal();
2634                         PerlIO_printf( Perl_debug_log,
2635                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2636                             (int)depth * 2 + 2, "",
2637                             (UV)state, (UV)idx,
2638                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2639                                 PL_colors[0], PL_colors[1],
2640                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2641                                 PERL_PV_ESCAPE_FIRSTCHAR
2642                             )
2643                         );
2644                     });
2645                     if ( state==1 ) {
2646                         OP( convert ) = nodetype;
2647                         str=STRING(convert);
2648                         STR_LEN(convert)=0;
2649                     }
2650                     STR_LEN(convert) += len;
2651                     while (len--)
2652                         *str++ = *ch++;
2653                 } else {
2654 #ifdef DEBUGGING
2655                     if (state>1)
2656                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2657 #endif
2658                     break;
2659                 }
2660             }
2661             trie->prefixlen = (state-1);
2662             if (str) {
2663                 regnode *n = convert+NODE_SZ_STR(convert);
2664                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2665                 trie->startstate = state;
2666                 trie->minlen -= (state - 1);
2667                 trie->maxlen -= (state - 1);
2668 #ifdef DEBUGGING
2669                /* At least the UNICOS C compiler choked on this
2670                 * being argument to DEBUG_r(), so let's just have
2671                 * it right here. */
2672                if (
2673 #ifdef PERL_EXT_RE_BUILD
2674                    1
2675 #else
2676                    DEBUG_r_TEST
2677 #endif
2678                    ) {
2679                    regnode *fix = convert;
2680                    U32 word = trie->wordcount;
2681                    mjd_nodelen++;
2682                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2683                    while( ++fix < n ) {
2684                        Set_Node_Offset_Length(fix, 0, 0);
2685                    }
2686                    while (word--) {
2687                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2688                        if (tmp) {
2689                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2690                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2691                            else
2692                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2693                        }
2694                    }
2695                }
2696 #endif
2697                 if (trie->maxlen) {
2698                     convert = n;
2699                 } else {
2700                     NEXT_OFF(convert) = (U16)(tail - convert);
2701                     DEBUG_r(optimize= n);
2702                 }
2703             }
2704         }
2705         if (!jumper)
2706             jumper = last;
2707         if ( trie->maxlen ) {
2708             NEXT_OFF( convert ) = (U16)(tail - convert);
2709             ARG_SET( convert, data_slot );
2710             /* Store the offset to the first unabsorbed branch in
2711                jump[0], which is otherwise unused by the jump logic.
2712                We use this when dumping a trie and during optimisation. */
2713             if (trie->jump)
2714                 trie->jump[0] = (U16)(nextbranch - convert);
2715
2716             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2717              *   and there is a bitmap
2718              *   and the first "jump target" node we found leaves enough room
2719              * then convert the TRIE node into a TRIEC node, with the bitmap
2720              * embedded inline in the opcode - this is hypothetically faster.
2721              */
2722             if ( !trie->states[trie->startstate].wordnum
2723                  && trie->bitmap
2724                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2725             {
2726                 OP( convert ) = TRIEC;
2727                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2728                 PerlMemShared_free(trie->bitmap);
2729                 trie->bitmap= NULL;
2730             } else
2731                 OP( convert ) = TRIE;
2732
2733             /* store the type in the flags */
2734             convert->flags = nodetype;
2735             DEBUG_r({
2736             optimize = convert
2737                       + NODE_STEP_REGNODE
2738                       + regarglen[ OP( convert ) ];
2739             });
2740             /* XXX We really should free up the resource in trie now,
2741                    as we won't use them - (which resources?) dmq */
2742         }
2743         /* needed for dumping*/
2744         DEBUG_r(if (optimize) {
2745             regnode *opt = convert;
2746
2747             while ( ++opt < optimize) {
2748                 Set_Node_Offset_Length(opt,0,0);
2749             }
2750             /*
2751                 Try to clean up some of the debris left after the
2752                 optimisation.
2753              */
2754             while( optimize < jumper ) {
2755                 mjd_nodelen += Node_Length((optimize));
2756                 OP( optimize ) = OPTIMIZED;
2757                 Set_Node_Offset_Length(optimize,0,0);
2758                 optimize++;
2759             }
2760             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2761         });
2762     } /* end node insert */
2763
2764     /*  Finish populating the prev field of the wordinfo array.  Walk back
2765      *  from each accept state until we find another accept state, and if
2766      *  so, point the first word's .prev field at the second word. If the
2767      *  second already has a .prev field set, stop now. This will be the
2768      *  case either if we've already processed that word's accept state,
2769      *  or that state had multiple words, and the overspill words were
2770      *  already linked up earlier.
2771      */
2772     {
2773         U16 word;
2774         U32 state;
2775         U16 prev;
2776
2777         for (word=1; word <= trie->wordcount; word++) {
2778             prev = 0;
2779             if (trie->wordinfo[word].prev)
2780                 continue;
2781             state = trie->wordinfo[word].accept;
2782             while (state) {
2783                 state = prev_states[state];
2784                 if (!state)
2785                     break;
2786                 prev = trie->states[state].wordnum;
2787                 if (prev)
2788                     break;
2789             }
2790             trie->wordinfo[word].prev = prev;
2791         }
2792         Safefree(prev_states);
2793     }
2794
2795
2796     /* and now dump out the compressed format */
2797     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2798
2799     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2800 #ifdef DEBUGGING
2801     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2802     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2803 #else
2804     SvREFCNT_dec_NN(revcharmap);
2805 #endif
2806     return trie->jump
2807            ? MADE_JUMP_TRIE
2808            : trie->startstate>1
2809              ? MADE_EXACT_TRIE
2810              : MADE_TRIE;
2811 }
2812
2813 STATIC void
2814 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2815 {
2816 /* The Trie is constructed and compressed now so we can build a fail array if
2817  * it's needed
2818
2819    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
2820    3.32 in the
2821    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
2822    Ullman 1985/88
2823    ISBN 0-201-10088-6
2824
2825    We find the fail state for each state in the trie, this state is the longest
2826    proper suffix of the current state's 'word' that is also a proper prefix of
2827    another word in our trie. State 1 represents the word '' and is thus the
2828    default fail state. This allows the DFA not to have to restart after its
2829    tried and failed a word at a given point, it simply continues as though it
2830    had been matching the other word in the first place.
2831    Consider
2832       'abcdgu'=~/abcdefg|cdgu/
2833    When we get to 'd' we are still matching the first word, we would encounter
2834    'g' which would fail, which would bring us to the state representing 'd' in
2835    the second word where we would try 'g' and succeed, proceeding to match
2836    'cdgu'.
2837  */
2838  /* add a fail transition */
2839     const U32 trie_offset = ARG(source);
2840     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2841     U32 *q;
2842     const U32 ucharcount = trie->uniquecharcount;
2843     const U32 numstates = trie->statecount;
2844     const U32 ubound = trie->lasttrans + ucharcount;
2845     U32 q_read = 0;
2846     U32 q_write = 0;
2847     U32 charid;
2848     U32 base = trie->states[ 1 ].trans.base;
2849     U32 *fail;
2850     reg_ac_data *aho;
2851     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
2852     GET_RE_DEBUG_FLAGS_DECL;
2853
2854     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2855 #ifndef DEBUGGING
2856     PERL_UNUSED_ARG(depth);
2857 #endif
2858
2859
2860     ARG_SET( stclass, data_slot );
2861     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2862     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2863     aho->trie=trie_offset;
2864     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2865     Copy( trie->states, aho->states, numstates, reg_trie_state );
2866     Newxz( q, numstates, U32);
2867     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2868     aho->refcount = 1;
2869     fail = aho->fail;
2870     /* initialize fail[0..1] to be 1 so that we always have
2871        a valid final fail state */
2872     fail[ 0 ] = fail[ 1 ] = 1;
2873
2874     for ( charid = 0; charid < ucharcount ; charid++ ) {
2875         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2876         if ( newstate ) {
2877             q[ q_write ] = newstate;
2878             /* set to point at the root */
2879             fail[ q[ q_write++ ] ]=1;
2880         }
2881     }
2882     while ( q_read < q_write) {
2883         const U32 cur = q[ q_read++ % numstates ];
2884         base = trie->states[ cur ].trans.base;
2885
2886         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2887             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2888             if (ch_state) {
2889                 U32 fail_state = cur;
2890                 U32 fail_base;
2891                 do {
2892                     fail_state = fail[ fail_state ];
2893                     fail_base = aho->states[ fail_state ].trans.base;
2894                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2895
2896                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2897                 fail[ ch_state ] = fail_state;
2898                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2899                 {
2900                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2901                 }
2902                 q[ q_write++ % numstates] = ch_state;
2903             }
2904         }
2905     }
2906     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2907        when we fail in state 1, this allows us to use the
2908        charclass scan to find a valid start char. This is based on the principle
2909        that theres a good chance the string being searched contains lots of stuff
2910        that cant be a start char.
2911      */
2912     fail[ 0 ] = fail[ 1 ] = 0;
2913     DEBUG_TRIE_COMPILE_r({
2914         PerlIO_printf(Perl_debug_log,
2915                       "%*sStclass Failtable (%"UVuf" states): 0",
2916                       (int)(depth * 2), "", (UV)numstates
2917         );
2918         for( q_read=1; q_read<numstates; q_read++ ) {
2919             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2920         }
2921         PerlIO_printf(Perl_debug_log, "\n");
2922     });
2923     Safefree(q);
2924     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2925 }
2926
2927
2928 #define DEBUG_PEEP(str,scan,depth) \
2929     DEBUG_OPTIMISE_r({if (scan){ \
2930        SV * const mysv=sv_newmortal(); \
2931        regnode *Next = regnext(scan); \
2932        regprop(RExC_rx, mysv, scan); \
2933        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2934        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2935        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2936    }});
2937
2938
2939 /* The below joins as many adjacent EXACTish nodes as possible into a single
2940  * one.  The regop may be changed if the node(s) contain certain sequences that
2941  * require special handling.  The joining is only done if:
2942  * 1) there is room in the current conglomerated node to entirely contain the
2943  *    next one.
2944  * 2) they are the exact same node type
2945  *
2946  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2947  * these get optimized out
2948  *
2949  * If a node is to match under /i (folded), the number of characters it matches
2950  * can be different than its character length if it contains a multi-character
2951  * fold.  *min_subtract is set to the total delta of the input nodes.
2952  *
2953  * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2954  * and contains LATIN SMALL LETTER SHARP S
2955  *
2956  * This is as good a place as any to discuss the design of handling these
2957  * multi-character fold sequences.  It's been wrong in Perl for a very long
2958  * time.  There are three code points in Unicode whose multi-character folds
2959  * were long ago discovered to mess things up.  The previous designs for
2960  * dealing with these involved assigning a special node for them.  This
2961  * approach doesn't work, as evidenced by this example:
2962  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
2963  * Both these fold to "sss", but if the pattern is parsed to create a node that
2964  * would match just the \xDF, it won't be able to handle the case where a
2965  * successful match would have to cross the node's boundary.  The new approach
2966  * that hopefully generally solves the problem generates an EXACTFU_SS node
2967  * that is "sss".
2968  *
2969  * It turns out that there are problems with all multi-character folds, and not
2970  * just these three.  Now the code is general, for all such cases.  The
2971  * approach taken is:
2972  * 1)   This routine examines each EXACTFish node that could contain multi-
2973  *      character fold sequences.  It returns in *min_subtract how much to
2974  *      subtract from the the actual length of the string to get a real minimum
2975  *      match length; it is 0 if there are no multi-char folds.  This delta is
2976  *      used by the caller to adjust the min length of the match, and the delta
2977  *      between min and max, so that the optimizer doesn't reject these
2978  *      possibilities based on size constraints.
2979  * 2)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
2980  *      is used for an EXACTFU node that contains at least one "ss" sequence in
2981  *      it.  For non-UTF-8 patterns and strings, this is the only case where
2982  *      there is a possible fold length change.  That means that a regular
2983  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
2984  *      with length changes, and so can be processed faster.  regexec.c takes
2985  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
2986  *      pre-folded by regcomp.c.  This saves effort in regex matching.
2987  *      However, the pre-folding isn't done for non-UTF8 patterns because the
2988  *      fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
2989  *      down by forcing the pattern into UTF8 unless necessary.  Also what
2990  *      EXACTF and EXACTFL nodes fold to isn't known until runtime.  The fold
2991  *      possibilities for the non-UTF8 patterns are quite simple, except for
2992  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
2993  *      members of a fold-pair, and arrays are set up for all of them so that
2994  *      the other member of the pair can be found quickly.  Code elsewhere in
2995  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
2996  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
2997  *      described in the next item.
2998  * 3)   A problem remains for the sharp s in EXACTF and EXACTFA nodes when the
2999  *      pattern isn't in UTF-8. (BTW, there cannot be an EXACTF node with a
3000  *      UTF-8 pattern.)  An assumption that the optimizer part of regexec.c
3001  *      (probably unwittingly, in Perl_regexec_flags()) makes is that a
3002  *      character in the pattern corresponds to at most a single character in
3003  *      the target string.  (And I do mean character, and not byte here, unlike
3004  *      other parts of the documentation that have never been updated to
3005  *      account for multibyte Unicode.)  sharp s in EXACTF nodes can match the
3006  *      two character string 'ss'; in EXACTFA nodes it can match
3007  *      "\x{17F}\x{17F}".  These violate the assumption, and they are the only
3008  *      instances where it is violated.  I'm reluctant to try to change the
3009  *      assumption, as the code involved is impenetrable to me (khw), so
3010  *      instead the code here punts.  This routine examines (when the pattern
3011  *      isn't UTF-8) EXACTF and EXACTFA nodes for the sharp s, and returns a
3012  *      boolean indicating whether or not the node contains a sharp s.  When it
3013  *      is true, the caller sets a flag that later causes the optimizer in this
3014  *      file to not set values for the floating and fixed string lengths, and
3015  *      thus avoids the optimizer code in regexec.c that makes the invalid
3016  *      assumption.  Thus, there is no optimization based on string lengths for
3017  *      non-UTF8-pattern EXACTF and EXACTFA nodes that contain the sharp s.
3018  *      (The reason the assumption is wrong only in these two cases is that all
3019  *      other non-UTF-8 folds are 1-1; and, for UTF-8 patterns, we pre-fold all
3020  *      other folds to their expanded versions.  We can't prefold sharp s to
3021  *      'ss' in EXACTF nodes because we don't know at compile time if it
3022  *      actually matches 'ss' or not.  It will match iff the target string is
3023  *      in UTF-8, unlike the EXACTFU nodes, where it always matches; and
3024  *      EXACTFA and EXACTFL where it never does.  In an EXACTFA node in a UTF-8
3025  *      pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the problem;
3026  *      but in a non-UTF8 pattern, folding it to that above-Latin1 string would
3027  *      require the pattern to be forced into UTF-8, the overhead of which we
3028  *      want to avoid.)
3029  *
3030  *      Similarly, the code that generates tries doesn't currently handle
3031  *      not-already-folded multi-char folds, and it looks like a pain to change
3032  *      that.  Therefore, trie generation of EXACTFA nodes with the sharp s
3033  *      doesn't work.  Instead, such an EXACTFA is turned into a new regnode,
3034  *      EXACTFA_NO_TRIE, which the trie code knows not to handle.  Most people
3035  *      using /iaa matching will be doing so almost entirely with ASCII
3036  *      strings, so this should rarely be encountered in practice */
3037
3038 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
3039     if (PL_regkind[OP(scan)] == EXACT) \
3040         join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
3041
3042 STATIC U32
3043 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
3044     /* Merge several consecutive EXACTish nodes into one. */
3045     regnode *n = regnext(scan);
3046     U32 stringok = 1;
3047     regnode *next = scan + NODE_SZ_STR(scan);
3048     U32 merged = 0;
3049     U32 stopnow = 0;
3050 #ifdef DEBUGGING
3051     regnode *stop = scan;
3052     GET_RE_DEBUG_FLAGS_DECL;
3053 #else
3054     PERL_UNUSED_ARG(depth);
3055 #endif
3056
3057     PERL_ARGS_ASSERT_JOIN_EXACT;
3058 #ifndef EXPERIMENTAL_INPLACESCAN
3059     PERL_UNUSED_ARG(flags);
3060     PERL_UNUSED_ARG(val);
3061 #endif
3062     DEBUG_PEEP("join",scan,depth);
3063
3064     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
3065      * EXACT ones that are mergeable to the current one. */
3066     while (n
3067            && (PL_regkind[OP(n)] == NOTHING
3068                || (stringok && OP(n) == OP(scan)))
3069            && NEXT_OFF(n)
3070            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3071     {
3072
3073         if (OP(n) == TAIL || n > next)
3074             stringok = 0;
3075         if (PL_regkind[OP(n)] == NOTHING) {
3076             DEBUG_PEEP("skip:",n,depth);
3077             NEXT_OFF(scan) += NEXT_OFF(n);
3078             next = n + NODE_STEP_REGNODE;
3079 #ifdef DEBUGGING
3080             if (stringok)
3081                 stop = n;
3082 #endif
3083             n = regnext(n);
3084         }
3085         else if (stringok) {
3086             const unsigned int oldl = STR_LEN(scan);
3087             regnode * const nnext = regnext(n);
3088
3089             /* XXX I (khw) kind of doubt that this works on platforms where
3090              * U8_MAX is above 255 because of lots of other assumptions */
3091             /* Don't join if the sum can't fit into a single node */
3092             if (oldl + STR_LEN(n) > U8_MAX)
3093                 break;
3094
3095             DEBUG_PEEP("merg",n,depth);
3096             merged++;
3097
3098             NEXT_OFF(scan) += NEXT_OFF(n);
3099             STR_LEN(scan) += STR_LEN(n);
3100             next = n + NODE_SZ_STR(n);
3101             /* Now we can overwrite *n : */
3102             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3103 #ifdef DEBUGGING
3104             stop = next - 1;
3105 #endif
3106             n = nnext;
3107             if (stopnow) break;
3108         }
3109
3110 #ifdef EXPERIMENTAL_INPLACESCAN
3111         if (flags && !NEXT_OFF(n)) {
3112             DEBUG_PEEP("atch", val, depth);
3113             if (reg_off_by_arg[OP(n)]) {
3114                 ARG_SET(n, val - n);
3115             }
3116             else {
3117                 NEXT_OFF(n) = val - n;
3118             }
3119             stopnow = 1;
3120         }
3121 #endif
3122     }
3123
3124     *min_subtract = 0;
3125     *has_exactf_sharp_s = FALSE;
3126
3127     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
3128      * can now analyze for sequences of problematic code points.  (Prior to
3129      * this final joining, sequences could have been split over boundaries, and
3130      * hence missed).  The sequences only happen in folding, hence for any
3131      * non-EXACT EXACTish node */
3132     if (OP(scan) != EXACT) {
3133         const U8 * const s0 = (U8*) STRING(scan);
3134         const U8 * s = s0;
3135         const U8 * const s_end = s0 + STR_LEN(scan);
3136
3137         /* One pass is made over the node's string looking for all the
3138          * possibilities.  to avoid some tests in the loop, there are two main
3139          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3140          * non-UTF-8 */
3141         if (UTF) {
3142
3143             /* Examine the string for a multi-character fold sequence.  UTF-8
3144              * patterns have all characters pre-folded by the time this code is
3145              * executed */
3146             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3147                                      length sequence we are looking for is 2 */
3148             {
3149                 int count = 0;
3150                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3151                 if (! len) {    /* Not a multi-char fold: get next char */
3152                     s += UTF8SKIP(s);
3153                     continue;
3154                 }
3155
3156                 /* Nodes with 'ss' require special handling, except for EXACTFL
3157                  * and EXACTFA-ish for which there is no multi-char fold to
3158                  * this */
3159                 if (len == 2 && *s == 's' && *(s+1) == 's'
3160                     && OP(scan) != EXACTFL
3161                     && OP(scan) != EXACTFA
3162                     && OP(scan) != EXACTFA_NO_TRIE)
3163                 {
3164                     count = 2;
3165                     OP(scan) = EXACTFU_SS;
3166                     s += 2;
3167                 }
3168                 else { /* Here is a generic multi-char fold. */
3169                     const U8* multi_end  = s + len;
3170
3171                     /* Count how many characters in it.  In the case of /l and
3172                      * /aa, no folds which contain ASCII code points are
3173                      * allowed, so check for those, and skip if found.  (In
3174                      * EXACTFL, no folds are allowed to any Latin1 code point,
3175                      * not just ASCII.  But there aren't any of these
3176                      * currently, nor ever likely, so don't take the time to
3177                      * test for them.  The code that generates the
3178                      * is_MULTI_foo() macros croaks should one actually get put
3179                      * into Unicode .) */
3180                     if (OP(scan) != EXACTFL
3181                         && OP(scan) != EXACTFA
3182                         && OP(scan) != EXACTFA_NO_TRIE)
3183                     {
3184                         count = utf8_length(s, multi_end);
3185                         s = multi_end;
3186                     }
3187                     else {
3188                         while (s < multi_end) {
3189                             if (isASCII(*s)) {
3190                                 s++;
3191                                 goto next_iteration;
3192                             }
3193                             else {
3194                                 s += UTF8SKIP(s);
3195                             }
3196                             count++;
3197                         }
3198                     }
3199                 }
3200
3201                 /* The delta is how long the sequence is minus 1 (1 is how long
3202                  * the character that folds to the sequence is) */
3203                 *min_subtract += count - 1;
3204             next_iteration: ;
3205             }
3206         }
3207         else if (OP(scan) == EXACTFA) {
3208
3209             /* Non-UTF-8 pattern, EXACTFA node.  There can't be a multi-char
3210              * fold to the ASCII range (and there are no existing ones in the
3211              * upper latin1 range).  But, as outlined in the comments preceding
3212              * this function, we need to flag any occurrences of the sharp s.
3213              * This character forbids trie formation (because of added
3214              * complexity) */
3215             while (s < s_end) {
3216                 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3217                     OP(scan) = EXACTFA_NO_TRIE;
3218                     *has_exactf_sharp_s = TRUE;
3219                     break;
3220                 }
3221                 s++;
3222                 continue;
3223             }
3224         }
3225         else if (OP(scan) != EXACTFL) {
3226
3227             /* Non-UTF-8 pattern, not EXACTFA nor EXACTFL node.  Look for the
3228              * multi-char folds that are all Latin1.  (This code knows that
3229              * there are no current multi-char folds possible with EXACTFL,
3230              * relying on fold_grind.t to catch any errors if the very unlikely
3231              * event happens that some get added in future Unicode versions.)
3232              * As explained in the comments preceding this function, we look
3233              * also for the sharp s in EXACTF nodes; it can be in the final
3234              * position.  Otherwise we can stop looking 1 byte earlier because
3235              * have to find at least two characters for a multi-fold */
3236             const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
3237
3238             while (s < upper) {
3239                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3240                 if (! len) {    /* Not a multi-char fold. */
3241                     if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
3242                     {
3243                         *has_exactf_sharp_s = TRUE;
3244                     }
3245                     s++;
3246                     continue;
3247                 }
3248
3249                 if (len == 2
3250                     && isARG2_lower_or_UPPER_ARG1('s', *s)
3251                     && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
3252                 {
3253
3254                     /* EXACTF nodes need to know that the minimum length
3255                      * changed so that a sharp s in the string can match this
3256                      * ss in the pattern, but they remain EXACTF nodes, as they
3257                      * won't match this unless the target string is is UTF-8,
3258                      * which we don't know until runtime */
3259                     if (OP(scan) != EXACTF) {
3260                         OP(scan) = EXACTFU_SS;
3261                     }
3262                 }
3263
3264                 *min_subtract += len - 1;
3265                 s += len;
3266             }
3267         }
3268     }
3269
3270 #ifdef DEBUGGING
3271     /* Allow dumping but overwriting the collection of skipped
3272      * ops and/or strings with fake optimized ops */
3273     n = scan + NODE_SZ_STR(scan);
3274     while (n <= stop) {
3275         OP(n) = OPTIMIZED;
3276         FLAGS(n) = 0;
3277         NEXT_OFF(n) = 0;
3278         n++;
3279     }
3280 #endif
3281     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3282     return stopnow;
3283 }
3284
3285 /* REx optimizer.  Converts nodes into quicker variants "in place".
3286    Finds fixed substrings.  */
3287
3288 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3289    to the position after last scanned or to NULL. */
3290
3291 #define INIT_AND_WITHP \
3292     assert(!and_withp); \
3293     Newx(and_withp,1, regnode_ssc); \
3294     SAVEFREEPV(and_withp)
3295
3296 /* this is a chain of data about sub patterns we are processing that
3297    need to be handled separately/specially in study_chunk. Its so
3298    we can simulate recursion without losing state.  */
3299 struct scan_frame;
3300 typedef struct scan_frame {
3301     regnode *last;  /* last node to process in this frame */
3302     regnode *next;  /* next node to process when last is reached */
3303     struct scan_frame *prev; /*previous frame*/
3304     I32 stop; /* what stopparen do we use */
3305 } scan_frame;
3306
3307
3308 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
3309
3310 STATIC SSize_t
3311 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3312                         SSize_t *minlenp, SSize_t *deltap,
3313                         regnode *last,
3314                         scan_data_t *data,
3315                         I32 stopparen,
3316                         U8* recursed,
3317                         regnode_ssc *and_withp,
3318                         U32 flags, U32 depth)
3319                         /* scanp: Start here (read-write). */
3320                         /* deltap: Write maxlen-minlen here. */
3321                         /* last: Stop before this one. */
3322                         /* data: string data about the pattern */
3323                         /* stopparen: treat close N as END */
3324                         /* recursed: which subroutines have we recursed into */
3325                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3326 {
3327     dVAR;
3328     /* There must be at least this number of characters to match */
3329     SSize_t min = 0;
3330     I32 pars = 0, code;
3331     regnode *scan = *scanp, *next;
3332     SSize_t delta = 0;
3333     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3334     int is_inf_internal = 0;            /* The studied chunk is infinite */
3335     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3336     scan_data_t data_fake;
3337     SV *re_trie_maxbuff = NULL;
3338     regnode *first_non_open = scan;
3339     SSize_t stopmin = SSize_t_MAX;
3340     scan_frame *frame = NULL;
3341     GET_RE_DEBUG_FLAGS_DECL;
3342
3343     PERL_ARGS_ASSERT_STUDY_CHUNK;
3344
3345 #ifdef DEBUGGING
3346     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3347 #endif
3348
3349     if ( depth == 0 ) {
3350         while (first_non_open && OP(first_non_open) == OPEN)
3351             first_non_open=regnext(first_non_open);
3352     }
3353
3354
3355   fake_study_recurse:
3356     while ( scan && OP(scan) != END && scan < last ){
3357         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3358                                    node length to get a real minimum (because
3359                                    the folded version may be shorter) */
3360         bool has_exactf_sharp_s = FALSE;
3361         /* Peephole optimizer: */
3362         DEBUG_STUDYDATA("Peep:", data,depth);
3363         DEBUG_PEEP("Peep",scan,depth);
3364
3365         /* Its not clear to khw or hv why this is done here, and not in the
3366          * clauses that deal with EXACT nodes.  khw's guess is that it's
3367          * because of a previous design */
3368         JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3369
3370         /* Follow the next-chain of the current node and optimize
3371            away all the NOTHINGs from it.  */
3372         if (OP(scan) != CURLYX) {
3373             const int max = (reg_off_by_arg[OP(scan)]
3374                        ? I32_MAX
3375                        /* I32 may be smaller than U16 on CRAYs! */
3376                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3377             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3378             int noff;
3379             regnode *n = scan;
3380
3381             /* Skip NOTHING and LONGJMP. */
3382             while ((n = regnext(n))
3383                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3384                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3385                    && off + noff < max)
3386                 off += noff;
3387             if (reg_off_by_arg[OP(scan)])
3388                 ARG(scan) = off;
3389             else
3390                 NEXT_OFF(scan) = off;
3391         }
3392
3393
3394
3395         /* The principal pseudo-switch.  Cannot be a switch, since we
3396            look into several different things.  */
3397         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3398                    || OP(scan) == IFTHEN) {
3399             next = regnext(scan);
3400             code = OP(scan);
3401             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3402
3403             if (OP(next) == code || code == IFTHEN) {
3404                 /* NOTE - There is similar code to this block below for
3405                  * handling TRIE nodes on a re-study.  If you change stuff here
3406                  * check there too. */
3407                 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3408                 regnode_ssc accum;
3409                 regnode * const startbranch=scan;
3410
3411                 if (flags & SCF_DO_SUBSTR)
3412                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3413                 if (flags & SCF_DO_STCLASS)
3414                     ssc_init_zero(pRExC_state, &accum);
3415
3416                 while (OP(scan) == code) {
3417                     SSize_t deltanext, minnext, fake;
3418                     I32 f = 0;
3419                     regnode_ssc this_class;
3420
3421                     num++;
3422                     data_fake.flags = 0;
3423                     if (data) {
3424                         data_fake.whilem_c = data->whilem_c;
3425                         data_fake.last_closep = data->last_closep;
3426                     }
3427                     else
3428                         data_fake.last_closep = &fake;
3429
3430                     data_fake.pos_delta = delta;
3431                     next = regnext(scan);
3432                     scan = NEXTOPER(scan);
3433                     if (code != BRANCH)
3434                         scan = NEXTOPER(scan);
3435                     if (flags & SCF_DO_STCLASS) {
3436                         ssc_init(pRExC_state, &this_class);
3437                         data_fake.start_class = &this_class;
3438                         f = SCF_DO_STCLASS_AND;
3439                     }
3440                     if (flags & SCF_WHILEM_VISITED_POS)
3441                         f |= SCF_WHILEM_VISITED_POS;
3442
3443                     /* we suppose the run is continuous, last=next...*/
3444                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3445                                           next, &data_fake,
3446                                           stopparen, recursed, NULL, f,depth+1);
3447                     if (min1 > minnext)
3448                         min1 = minnext;
3449                     if (deltanext == SSize_t_MAX) {
3450                         is_inf = is_inf_internal = 1;
3451                         max1 = SSize_t_MAX;
3452                     } else if (max1 < minnext + deltanext)
3453                         max1 = minnext + deltanext;
3454                     scan = next;
3455                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3456                         pars++;
3457                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3458                         if ( stopmin > minnext)
3459                             stopmin = min + min1;
3460                         flags &= ~SCF_DO_SUBSTR;
3461                         if (data)
3462                             data->flags |= SCF_SEEN_ACCEPT;
3463                     }
3464                     if (data) {
3465                         if (data_fake.flags & SF_HAS_EVAL)
3466                             data->flags |= SF_HAS_EVAL;
3467                         data->whilem_c = data_fake.whilem_c;
3468                     }
3469                     if (flags & SCF_DO_STCLASS)
3470                         ssc_or(pRExC_state, &accum, &this_class);
3471                 }
3472                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3473                     min1 = 0;
3474                 if (flags & SCF_DO_SUBSTR) {
3475                     data->pos_min += min1;
3476                     if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3477                         data->pos_delta = SSize_t_MAX;
3478                     else
3479                         data->pos_delta += max1 - min1;
3480                     if (max1 != min1 || is_inf)
3481                         data->longest = &(data->longest_float);
3482                 }
3483                 min += min1;
3484                 if (delta == SSize_t_MAX
3485                  || SSize_t_MAX - delta - (max1 - min1) < 0)
3486                     delta = SSize_t_MAX;
3487                 else
3488                     delta += max1 - min1;
3489                 if (flags & SCF_DO_STCLASS_OR) {
3490                     ssc_or(pRExC_state, data->start_class, &accum);
3491                     if (min1) {
3492                         ssc_and(pRExC_state, data->start_class, and_withp);
3493                         flags &= ~SCF_DO_STCLASS;
3494                     }
3495                 }
3496                 else if (flags & SCF_DO_STCLASS_AND) {
3497                     if (min1) {
3498                         ssc_and(pRExC_state, data->start_class, &accum);
3499                         flags &= ~SCF_DO_STCLASS;
3500                     }
3501                     else {
3502                         /* Switch to OR mode: cache the old value of
3503                          * data->start_class */
3504                         INIT_AND_WITHP;
3505                         StructCopy(data->start_class, and_withp, regnode_ssc);
3506                         flags &= ~SCF_DO_STCLASS_AND;
3507                         StructCopy(&accum, data->start_class, regnode_ssc);
3508                         flags |= SCF_DO_STCLASS_OR;
3509                     }
3510                 }
3511
3512                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3513                 /* demq.
3514
3515                    Assuming this was/is a branch we are dealing with: 'scan'
3516                    now points at the item that follows the branch sequence,
3517                    whatever it is. We now start at the beginning of the
3518                    sequence and look for subsequences of
3519
3520                    BRANCH->EXACT=>x1
3521                    BRANCH->EXACT=>x2
3522                    tail
3523
3524                    which would be constructed from a pattern like
3525                    /A|LIST|OF|WORDS/
3526
3527                    If we can find such a subsequence we need to turn the first
3528                    element into a trie and then add the subsequent branch exact
3529                    strings to the trie.
3530
3531                    We have two cases
3532
3533                      1. patterns where the whole set of branches can be
3534                         converted.
3535
3536                      2. patterns where only a subset can be converted.
3537
3538                    In case 1 we can replace the whole set with a single regop
3539                    for the trie. In case 2 we need to keep the start and end
3540                    branches so
3541
3542                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3543                      becomes BRANCH TRIE; BRANCH X;
3544
3545                   There is an additional case, that being where there is a
3546                   common prefix, which gets split out into an EXACT like node
3547                   preceding the TRIE node.
3548
3549                   If x(1..n)==tail then we can do a simple trie, if not we make
3550                   a "jump" trie, such that when we match the appropriate word
3551                   we "jump" to the appropriate tail node. Essentially we turn
3552                   a nested if into a case structure of sorts.
3553
3554                 */
3555
3556                     int made=0;
3557                     if (!re_trie_maxbuff) {
3558                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3559                         if (!SvIOK(re_trie_maxbuff))
3560                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3561                     }
3562                     if ( SvIV(re_trie_maxbuff)>=0  ) {
3563                         regnode *cur;
3564                         regnode *first = (regnode *)NULL;
3565                         regnode *last = (regnode *)NULL;
3566                         regnode *tail = scan;
3567                         U8 trietype = 0;
3568                         U32 count=0;
3569
3570 #ifdef DEBUGGING
3571                         SV * const mysv = sv_newmortal();       /* for dumping */
3572 #endif
3573                         /* var tail is used because there may be a TAIL
3574                            regop in the way. Ie, the exacts will point to the
3575                            thing following the TAIL, but the last branch will
3576                            point at the TAIL. So we advance tail. If we
3577                            have nested (?:) we may have to move through several
3578                            tails.
3579                          */
3580
3581                         while ( OP( tail ) == TAIL ) {
3582                             /* this is the TAIL generated by (?:) */
3583                             tail = regnext( tail );
3584                         }
3585
3586
3587                         DEBUG_TRIE_COMPILE_r({
3588                             regprop(RExC_rx, mysv, tail );
3589                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3590                                 (int)depth * 2 + 2, "",
3591                                 "Looking for TRIE'able sequences. Tail node is: ",
3592                                 SvPV_nolen_const( mysv )
3593                             );
3594                         });
3595
3596                         /*
3597
3598                             Step through the branches
3599                                 cur represents each branch,
3600                                 noper is the first thing to be matched as part
3601                                       of that branch
3602                                 noper_next is the regnext() of that node.
3603
3604                             We normally handle a case like this
3605                             /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
3606                             support building with NOJUMPTRIE, which restricts
3607                             the trie logic to structures like /FOO|BAR/.
3608
3609                             If noper is a trieable nodetype then the branch is
3610                             a possible optimization target. If we are building
3611                             under NOJUMPTRIE then we require that noper_next is
3612                             the same as scan (our current position in the regex
3613                             program).
3614
3615                             Once we have two or more consecutive such branches
3616                             we can create a trie of the EXACT's contents and
3617                             stitch it in place into the program.
3618
3619                             If the sequence represents all of the branches in
3620                             the alternation we replace the entire thing with a
3621                             single TRIE node.
3622
3623                             Otherwise when it is a subsequence we need to
3624                             stitch it in place and replace only the relevant
3625                             branches. This means the first branch has to remain
3626                             as it is used by the alternation logic, and its
3627                             next pointer, and needs to be repointed at the item
3628                             on the branch chain following the last branch we
3629                             have optimized away.
3630
3631                             This could be either a BRANCH, in which case the
3632                             subsequence is internal, or it could be the item
3633                             following the branch sequence in which case the
3634                             subsequence is at the end (which does not
3635                             necessarily mean the first node is the start of the
3636                             alternation).
3637
3638                             TRIE_TYPE(X) is a define which maps the optype to a
3639                             trietype.
3640
3641                                 optype          |  trietype
3642                                 ----------------+-----------
3643                                 NOTHING         | NOTHING
3644                                 EXACT           | EXACT
3645                                 EXACTFU         | EXACTFU
3646                                 EXACTFU_SS      | EXACTFU
3647                                 EXACTFA         | EXACTFA
3648
3649
3650                         */
3651 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3652                        ( EXACT == (X) )   ? EXACT :        \
3653                        ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU :        \
3654                        ( EXACTFA == (X) ) ? EXACTFA :        \
3655                        0 )
3656
3657                         /* dont use tail as the end marker for this traverse */
3658                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3659                             regnode * const noper = NEXTOPER( cur );
3660                             U8 noper_type = OP( noper );
3661                             U8 noper_trietype = TRIE_TYPE( noper_type );
3662 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3663                             regnode * const noper_next = regnext( noper );
3664                             U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3665                             U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3666 #endif
3667
3668                             DEBUG_TRIE_COMPILE_r({
3669                                 regprop(RExC_rx, mysv, cur);
3670                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3671                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3672
3673                                 regprop(RExC_rx, mysv, noper);
3674                                 PerlIO_printf( Perl_debug_log, " -> %s",
3675                                     SvPV_nolen_const(mysv));
3676
3677                                 if ( noper_next ) {
3678                                   regprop(RExC_rx, mysv, noper_next );
3679                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3680                                     SvPV_nolen_const(mysv));
3681                                 }
3682                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3683                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3684                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3685                                 );
3686                             });
3687
3688                             /* Is noper a trieable nodetype that can be merged
3689                              * with the current trie (if there is one)? */
3690                             if ( noper_trietype
3691                                   &&
3692                                   (
3693                                         ( noper_trietype == NOTHING)
3694                                         || ( trietype == NOTHING )
3695                                         || ( trietype == noper_trietype )
3696                                   )
3697 #ifdef NOJUMPTRIE
3698                                   && noper_next == tail
3699 #endif
3700                                   && count < U16_MAX)
3701                             {
3702                                 /* Handle mergable triable node Either we are
3703                                  * the first node in a new trieable sequence,
3704                                  * in which case we do some bookkeeping,
3705                                  * otherwise we update the end pointer. */
3706                                 if ( !first ) {
3707                                     first = cur;
3708                                     if ( noper_trietype == NOTHING ) {
3709 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3710                                         regnode * const noper_next = regnext( noper );
3711                                         U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3712                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3713 #endif
3714
3715                                         if ( noper_next_trietype ) {
3716                                             trietype = noper_next_trietype;
3717                                         } else if (noper_next_type)  {
3718                                             /* a NOTHING regop is 1 regop wide.
3719                                              * We need at least two for a trie
3720                                              * so we can't merge this in */
3721                                             first = NULL;
3722                                         }
3723                                     } else {
3724                                         trietype = noper_trietype;
3725                                     }
3726                                 } else {
3727                                     if ( trietype == NOTHING )
3728                                         trietype = noper_trietype;
3729                                     last = cur;
3730                                 }
3731                                 if (first)
3732                                     count++;
3733                             } /* end handle mergable triable node */
3734                             else {
3735                                 /* handle unmergable node -
3736                                  * noper may either be a triable node which can
3737                                  * not be tried together with the current trie,
3738                                  * or a non triable node */
3739                                 if ( last ) {
3740                                     /* If last is set and trietype is not
3741                                      * NOTHING then we have found at least two
3742                                      * triable branch sequences in a row of a
3743                                      * similar trietype so we can turn them
3744                                      * into a trie. If/when we allow NOTHING to
3745                                      * start a trie sequence this condition
3746                                      * will be required, and it isn't expensive
3747                                      * so we leave it in for now. */
3748                                     if ( trietype && trietype != NOTHING )
3749                                         make_trie( pRExC_state,
3750                                                 startbranch, first, cur, tail, count,
3751                                                 trietype, depth+1 );
3752                                     last = NULL; /* note: we clear/update
3753                                                     first, trietype etc below,
3754                                                     so we dont do it here */
3755                                 }
3756                                 if ( noper_trietype
3757 #ifdef NOJUMPTRIE
3758                                      && noper_next == tail
3759 #endif
3760                                 ){
3761                                     /* noper is triable, so we can start a new
3762                                      * trie sequence */
3763                                     count = 1;
3764                                     first = cur;
3765                                     trietype = noper_trietype;
3766                                 } else if (first) {
3767                                     /* if we already saw a first but the
3768                                      * current node is not triable then we have
3769                                      * to reset the first information. */
3770                                     count = 0;
3771                                     first = NULL;
3772                                     trietype = 0;
3773                                 }
3774                             } /* end handle unmergable node */
3775                         } /* loop over branches */
3776                         DEBUG_TRIE_COMPILE_r({
3777                             regprop(RExC_rx, mysv, cur);
3778                             PerlIO_printf( Perl_debug_log,
3779                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3780                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3781
3782                         });
3783                         if ( last && trietype ) {
3784                             if ( trietype != NOTHING ) {
3785                                 /* the last branch of the sequence was part of
3786                                  * a trie, so we have to construct it here
3787                                  * outside of the loop */
3788                                 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3789 #ifdef TRIE_STUDY_OPT
3790                                 if ( ((made == MADE_EXACT_TRIE &&
3791                                      startbranch == first)
3792                                      || ( first_non_open == first )) &&
3793                                      depth==0 ) {
3794                                     flags |= SCF_TRIE_RESTUDY;
3795                                     if ( startbranch == first
3796                                          && scan == tail )
3797                                     {
3798                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3799                                     }
3800                                 }
3801 #endif
3802                             } else {
3803                                 /* at this point we know whatever we have is a
3804                                  * NOTHING sequence/branch AND if 'startbranch'
3805                                  * is 'first' then we can turn the whole thing
3806                                  * into a NOTHING
3807                                  */
3808                                 if ( startbranch == first ) {
3809                                     regnode *opt;
3810                                     /* the entire thing is a NOTHING sequence,
3811                                      * something like this: (?:|) So we can
3812                                      * turn it into a plain NOTHING op. */
3813                                     DEBUG_TRIE_COMPILE_r({
3814                                         regprop(RExC_rx, mysv, cur);
3815                                         PerlIO_printf( Perl_debug_log,
3816                                           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3817                                           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3818
3819                                     });
3820                                     OP(startbranch)= NOTHING;
3821                                     NEXT_OFF(startbranch)= tail - startbranch;
3822                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
3823                                         OP(opt)= OPTIMIZED;
3824                                 }
3825                             }
3826                         } /* end if ( last) */
3827                     } /* TRIE_MAXBUF is non zero */
3828
3829                 } /* do trie */
3830
3831             }
3832             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
3833                 scan = NEXTOPER(NEXTOPER(scan));
3834             } else                      /* single branch is optimized. */
3835                 scan = NEXTOPER(scan);
3836             continue;
3837         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3838             scan_frame *newframe = NULL;
3839             I32 paren;
3840             regnode *start;
3841             regnode *end;
3842
3843             if (OP(scan) != SUSPEND) {
3844             /* set the pointer */
3845                 if (OP(scan) == GOSUB) {
3846                     paren = ARG(scan);
3847                     RExC_recurse[ARG2L(scan)] = scan;
3848                     start = RExC_open_parens[paren-1];
3849                     end   = RExC_close_parens[paren-1];
3850                 } else {
3851                     paren = 0;
3852                     start = RExC_rxi->program + 1;
3853                     end   = RExC_opend;
3854                 }
3855                 if (!recursed) {
3856                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3857                     SAVEFREEPV(recursed);
3858                 }
3859                 if (!PAREN_TEST(recursed,paren+1)) {
3860                     PAREN_SET(recursed,paren+1);
3861                     Newx(newframe,1,scan_frame);
3862                 } else {
3863                     if (flags & SCF_DO_SUBSTR) {
3864                         SCAN_COMMIT(pRExC_state,data,minlenp);
3865                         data->longest = &(data->longest_float);
3866                     }
3867                     is_inf = is_inf_internal = 1;
3868                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3869                         ssc_anything(data->start_class);
3870                     flags &= ~SCF_DO_STCLASS;
3871                 }
3872             } else {
3873                 Newx(newframe,1,scan_frame);
3874                 paren = stopparen;
3875                 start = scan+2;
3876                 end = regnext(scan);
3877             }
3878             if (newframe) {
3879                 assert(start);
3880                 assert(end);
3881                 SAVEFREEPV(newframe);
3882                 newframe->next = regnext(scan);
3883                 newframe->last = last;
3884                 newframe->stop = stopparen;
3885                 newframe->prev = frame;
3886
3887                 frame = newframe;
3888                 scan =  start;
3889                 stopparen = paren;
3890                 last = end;
3891
3892                 continue;
3893             }
3894         }
3895         else if (OP(scan) == EXACT) {
3896             SSize_t l = STR_LEN(scan);
3897             UV uc;
3898             if (UTF) {
3899                 const U8 * const s = (U8*)STRING(scan);
3900                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3901                 l = utf8_length(s, s + l);
3902             } else {
3903                 uc = *((U8*)STRING(scan));
3904             }
3905             min += l;
3906             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3907                 /* The code below prefers earlier match for fixed
3908                    offset, later match for variable offset.  */
3909                 if (data->last_end == -1) { /* Update the start info. */
3910                     data->last_start_min = data->pos_min;
3911                     data->last_start_max = is_inf
3912                         ? SSize_t_MAX : data->pos_min + data->pos_delta;
3913                 }
3914                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3915                 if (UTF)
3916                     SvUTF8_on(data->last_found);
3917                 {
3918                     SV * const sv = data->last_found;
3919                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3920                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3921                     if (mg && mg->mg_len >= 0)
3922                         mg->mg_len += utf8_length((U8*)STRING(scan),
3923                                                   (U8*)STRING(scan)+STR_LEN(scan));
3924                 }
3925                 data->last_end = data->pos_min + l;
3926                 data->pos_min += l; /* As in the first entry. */
3927                 data->flags &= ~SF_BEFORE_EOL;
3928             }
3929
3930             /* ANDing the code point leaves at most it, and not in locale, and
3931              * can't match null string */
3932             if (flags & SCF_DO_STCLASS_AND) {
3933                 ssc_cp_and(data->start_class, uc);
3934                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
3935                 ssc_clear_locale(data->start_class);
3936             }
3937             else if (flags & SCF_DO_STCLASS_OR) {
3938                 ssc_add_cp(data->start_class, uc);
3939                 ssc_and(pRExC_state, data->start_class, and_withp);
3940             }
3941             flags &= ~SCF_DO_STCLASS;
3942         }
3943         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3944             SSize_t l = STR_LEN(scan);
3945             UV uc = *((U8*)STRING(scan));
3946             SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
3947                                                      separate code points */
3948
3949             /* Search for fixed substrings supports EXACT only. */
3950             if (flags & SCF_DO_SUBSTR) {
3951                 assert(data);
3952                 SCAN_COMMIT(pRExC_state, data, minlenp);
3953             }
3954             if (UTF) {
3955                 const U8 * const s = (U8 *)STRING(scan);
3956                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3957                 l = utf8_length(s, s + l);
3958             }
3959             if (has_exactf_sharp_s) {
3960                 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3961             }
3962             min += l - min_subtract;
3963             assert (min >= 0);
3964             delta += min_subtract;
3965             if (flags & SCF_DO_SUBSTR) {
3966                 data->pos_min += l - min_subtract;
3967                 if (data->pos_min < 0) {
3968                     data->pos_min = 0;
3969                 }
3970                 data->pos_delta += min_subtract;
3971                 if (min_subtract) {
3972                     data->longest = &(data->longest_float);
3973                 }
3974             }
3975             if (OP(scan) == EXACTFL) {
3976                 if (flags & SCF_DO_STCLASS_AND) {
3977                     ssc_flags_and(data->start_class,
3978                                                 ANYOF_LOCALE|ANYOF_LOC_FOLD);
3979                 }
3980                 else if (flags & SCF_DO_STCLASS_OR) {
3981                     ANYOF_FLAGS(data->start_class)
3982                                                 |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
3983                 }
3984
3985                 /* We don't know what the folds are; it could be anything. XXX
3986                  * Actually, we only support UTF-8 encoding for code points
3987                  * above Latin1, so we could know what those folds are. */
3988                 EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
3989                                                        0,
3990                                                        UV_MAX);
3991             }
3992             else {  /* Non-locale EXACTFish */
3993                 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
3994                 if (flags & SCF_DO_STCLASS_AND) {
3995                     ssc_clear_locale(data->start_class);
3996                 }
3997                 if (uc < 256) { /* We know what the Latin1 folds are ... */
3998                     if (IS_IN_SOME_FOLD_L1(uc)) {   /* For instance, we
3999                                                        know if anything folds
4000                                                        with this */
4001                         EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
4002                                                            PL_fold_latin1[uc]);
4003                         if (OP(scan) != EXACTFA) { /* The folds below aren't
4004                                                       legal under /iaa */
4005                             if (isARG2_lower_or_UPPER_ARG1('s', uc)) {
4006                                 EXACTF_invlist
4007                                     = add_cp_to_invlist(EXACTF_invlist,
4008                                                 LATIN_SMALL_LETTER_SHARP_S);
4009                             }
4010                             else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
4011                                 EXACTF_invlist
4012                                     = add_cp_to_invlist(EXACTF_invlist, 's');
4013                                 EXACTF_invlist
4014                                     = add_cp_to_invlist(EXACTF_invlist, 'S');
4015                             }
4016                         }
4017
4018                         /* We also know if there are above-Latin1 code points
4019                          * that fold to this (none legal for ASCII and /iaa) */
4020                         if ((! isASCII(uc) || OP(scan) != EXACTFA)
4021                             && HAS_NONLATIN1_FOLD_CLOSURE(uc))
4022                         {
4023                             /* XXX We could know exactly what does fold to this
4024                              * if the reverse folds are loaded, as currently in
4025                              * S_regclass() */
4026                             _invlist_union(EXACTF_invlist,
4027                                            PL_AboveLatin1,
4028                                            &EXACTF_invlist);
4029                         }
4030                     }
4031                 }
4032                 else {  /* Non-locale, above Latin1.  XXX We don't currently
4033                            know what participates in folds with this, so have
4034                            to assume anything could */
4035
4036                     /* XXX We could know exactly what does fold to this if the
4037                      * reverse folds are loaded, as currently in S_regclass().
4038                      * But we do know that under /iaa nothing in the ASCII
4039                      * range can participate */
4040                     if (OP(scan) == EXACTFA) {
4041                         _invlist_union_complement_2nd(EXACTF_invlist,
4042                                                       PL_Posix_ptrs[_CC_ASCII],
4043                                                       &EXACTF_invlist);
4044                     }
4045                     else {
4046                         EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
4047                                                                0, UV_MAX);
4048                     }
4049                 }
4050             }
4051             if (flags & SCF_DO_STCLASS_AND) {
4052                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4053                 ANYOF_POSIXL_ZERO(data->start_class);
4054                 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4055             }
4056             else if (flags & SCF_DO_STCLASS_OR) {
4057                 ssc_union(data->start_class, EXACTF_invlist, FALSE);
4058                 ssc_and(pRExC_state, data->start_class, and_withp);
4059             }
4060             flags &= ~SCF_DO_STCLASS;
4061             SvREFCNT_dec(EXACTF_invlist);
4062         }
4063         else if (REGNODE_VARIES(OP(scan))) {
4064             SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4065             I32 fl = 0, f = flags;
4066             regnode * const oscan = scan;
4067             regnode_ssc this_class;
4068             regnode_ssc *oclass = NULL;
4069             I32 next_is_eval = 0;
4070
4071             switch (PL_regkind[OP(scan)]) {
4072             case WHILEM:                /* End of (?:...)* . */
4073                 scan = NEXTOPER(scan);
4074                 goto finish;
4075             case PLUS:
4076                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4077                     next = NEXTOPER(scan);
4078                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
4079                         mincount = 1;
4080                         maxcount = REG_INFTY;
4081                         next = regnext(scan);
4082                         scan = NEXTOPER(scan);
4083                         goto do_curly;
4084                     }
4085                 }
4086                 if (flags & SCF_DO_SUBSTR)
4087                     data->pos_min++;
4088                 min++;
4089                 /* Fall through. */
4090             case STAR:
4091                 if (flags & SCF_DO_STCLASS) {
4092                     mincount = 0;
4093                     maxcount = REG_INFTY;
4094                     next = regnext(scan);
4095                     scan = NEXTOPER(scan);
4096                     goto do_curly;
4097                 }
4098                 is_inf = is_inf_internal = 1;
4099                 scan = regnext(scan);
4100                 if (flags & SCF_DO_SUBSTR) {
4101                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
4102                     data->longest = &(data->longest_float);
4103                 }
4104                 goto optimize_curly_tail;
4105             case CURLY:
4106                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4107                     && (scan->flags == stopparen))
4108                 {
4109                     mincount = 1;
4110                     maxcount = 1;
4111                 } else {
4112                     mincount = ARG1(scan);
4113                     maxcount = ARG2(scan);
4114                 }
4115                 next = regnext(scan);
4116                 if (OP(scan) == CURLYX) {
4117                     I32 lp = (data ? *(data->last_closep) : 0);
4118                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4119                 }
4120                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4121                 next_is_eval = (OP(scan) == EVAL);
4122               do_curly:
4123                 if (flags & SCF_DO_SUBSTR) {
4124                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
4125                     pos_before = data->pos_min;
4126                 }
4127                 if (data) {
4128                     fl = data->flags;
4129                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4130                     if (is_inf)
4131                         data->flags |= SF_IS_INF;
4132                 }
4133                 if (flags & SCF_DO_STCLASS) {
4134                     ssc_init(pRExC_state, &this_class);
4135                     oclass = data->start_class;
4136                     data->start_class = &this_class;
4137                     f |= SCF_DO_STCLASS_AND;
4138                     f &= ~SCF_DO_STCLASS_OR;
4139                 }
4140                 /* Exclude from super-linear cache processing any {n,m}
4141                    regops for which the combination of input pos and regex
4142                    pos is not enough information to determine if a match
4143                    will be possible.
4144
4145                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
4146                    regex pos at the \s*, the prospects for a match depend not
4147                    only on the input position but also on how many (bar\s*)
4148                    repeats into the {4,8} we are. */
4149                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4150                     f &= ~SCF_WHILEM_VISITED_POS;
4151
4152                 /* This will finish on WHILEM, setting scan, or on NULL: */
4153                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4154                                       last, data, stopparen, recursed, NULL,
4155                                       (mincount == 0
4156                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
4157
4158                 if (flags & SCF_DO_STCLASS)
4159                     data->start_class = oclass;
4160                 if (mincount == 0 || minnext == 0) {
4161                     if (flags & SCF_DO_STCLASS_OR) {
4162                         ssc_or(pRExC_state, data->start_class, &this_class);
4163                     }
4164                     else if (flags & SCF_DO_STCLASS_AND) {
4165                         /* Switch to OR mode: cache the old value of
4166                          * data->start_class */
4167                         INIT_AND_WITHP;
4168                         StructCopy(data->start_class, and_withp, regnode_ssc);
4169                         flags &= ~SCF_DO_STCLASS_AND;
4170                         StructCopy(&this_class, data->start_class, regnode_ssc);
4171                         flags |= SCF_DO_STCLASS_OR;
4172                         ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
4173                     }
4174                 } else {                /* Non-zero len */
4175                     if (flags & SCF_DO_STCLASS_OR) {
4176                         ssc_or(pRExC_state, data->start_class, &this_class);
4177                         ssc_and(pRExC_state, data->start_class, and_withp);
4178                     }
4179                     else if (flags & SCF_DO_STCLASS_AND)
4180                         ssc_and(pRExC_state, data->start_class, &this_class);
4181                     flags &= ~SCF_DO_STCLASS;
4182                 }
4183                 if (!scan)              /* It was not CURLYX, but CURLY. */
4184                     scan = next;
4185                 if (!(flags & SCF_TRIE_DOING_RESTUDY)
4186                     /* ? quantifier ok, except for (?{ ... }) */
4187                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
4188                     && (minnext == 0) && (deltanext == 0)
4189                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4190                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
4191                 {
4192                     /* Fatal warnings may leak the regexp without this: */
4193                     SAVEFREESV(RExC_rx_sv);
4194                     ckWARNreg(RExC_parse,
4195                               "Quantifier unexpected on zero-length expression");
4196                     (void)ReREFCNT_inc(RExC_rx_sv);
4197                 }
4198
4199                 min += minnext * mincount;
4200                 is_inf_internal |= deltanext == SSize_t_MAX
4201                                      || (maxcount == REG_INFTY && minnext + deltanext > 0);
4202                 is_inf |= is_inf_internal;
4203                 if (is_inf)
4204                     delta = SSize_t_MAX;
4205                 else
4206                     delta += (minnext + deltanext) * maxcount - minnext * mincount;
4207
4208                 /* Try powerful optimization CURLYX => CURLYN. */
4209                 if (  OP(oscan) == CURLYX && data
4210                       && data->flags & SF_IN_PAR
4211                       && !(data->flags & SF_HAS_EVAL)
4212                       && !deltanext && minnext == 1 ) {
4213                     /* Try to optimize to CURLYN.  */
4214                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4215                     regnode * const nxt1 = nxt;
4216 #ifdef DEBUGGING
4217                     regnode *nxt2;
4218 #endif
4219
4220                     /* Skip open. */
4221                     nxt = regnext(nxt);
4222                     if (!REGNODE_SIMPLE(OP(nxt))
4223                         && !(PL_regkind[OP(nxt)] == EXACT
4224                              && STR_LEN(nxt) == 1))
4225                         goto nogo;
4226 #ifdef DEBUGGING
4227                     nxt2 = nxt;
4228 #endif
4229                     nxt = regnext(nxt);
4230                     if (OP(nxt) != CLOSE)
4231                         goto nogo;
4232                     if (RExC_open_parens) {
4233                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4234                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4235                     }
4236                     /* Now we know that nxt2 is the only contents: */
4237                     oscan->flags = (U8)ARG(nxt);
4238                     OP(oscan) = CURLYN;
4239                     OP(nxt1) = NOTHING; /* was OPEN. */
4240
4241 #ifdef DEBUGGING
4242                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4243                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4244                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4245                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
4246                     OP(nxt + 1) = OPTIMIZED; /* was count. */
4247                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4248 #endif
4249                 }
4250               nogo:
4251
4252                 /* Try optimization CURLYX => CURLYM. */
4253                 if (  OP(oscan) == CURLYX && data
4254                       && !(data->flags & SF_HAS_PAR)
4255                       && !(data->flags & SF_HAS_EVAL)
4256                       && !deltanext     /* atom is fixed width */
4257                       && minnext != 0   /* CURLYM can't handle zero width */
4258                       && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
4259                 ) {
4260                     /* XXXX How to optimize if data == 0? */
4261                     /* Optimize to a simpler form.  */
4262                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4263                     regnode *nxt2;
4264
4265                     OP(oscan) = CURLYM;
4266                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4267                             && (OP(nxt2) != WHILEM))
4268                         nxt = nxt2;
4269                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
4270                     /* Need to optimize away parenths. */
4271                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4272                         /* Set the parenth number.  */
4273                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4274
4275                         oscan->flags = (U8)ARG(nxt);
4276                         if (RExC_open_parens) {
4277                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4278                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4279                         }
4280                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
4281                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
4282
4283 #ifdef DEBUGGING
4284                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4285                         OP(nxt + 1) = OPTIMIZED; /* was count. */
4286                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4287                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4288 #endif
4289 #if 0
4290                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
4291                             regnode *nnxt = regnext(nxt1);
4292                             if (nnxt == nxt) {
4293                                 if (reg_off_by_arg[OP(nxt1)])
4294                                     ARG_SET(nxt1, nxt2 - nxt1);
4295                                 else if (nxt2 - nxt1 < U16_MAX)
4296                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
4297                                 else
4298                                     OP(nxt) = NOTHING;  /* Cannot beautify */
4299                             }
4300                             nxt1 = nnxt;
4301                         }
4302 #endif
4303                         /* Optimize again: */
4304                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4305                                     NULL, stopparen, recursed, NULL, 0,depth+1);
4306                     }
4307                     else
4308                         oscan->flags = 0;
4309                 }
4310                 else if ((OP(oscan) == CURLYX)
4311                          && (flags & SCF_WHILEM_VISITED_POS)
4312                          /* See the comment on a similar expression above.
4313                             However, this time it's not a subexpression
4314                             we care about, but the expression itself. */
4315                          && (maxcount == REG_INFTY)
4316                          && data && ++data->whilem_c < 16) {
4317                     /* This stays as CURLYX, we can put the count/of pair. */
4318                     /* Find WHILEM (as in regexec.c) */
4319                     regnode *nxt = oscan + NEXT_OFF(oscan);
4320
4321                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4322                         nxt += ARG(nxt);
4323                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
4324                         | (RExC_whilem_seen << 4)); /* On WHILEM */
4325                 }
4326                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4327                     pars++;
4328                 if (flags & SCF_DO_SUBSTR) {
4329                     SV *last_str = NULL;
4330                     int counted = mincount != 0;
4331
4332                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4333                         SSize_t b = pos_before >= data->last_start_min
4334                             ? pos_before : data->last_start_min;
4335                         STRLEN l;
4336                         const char * const s = SvPV_const(data->last_found, l);
4337                         SSize_t old = b - data->last_start_min;
4338
4339                         if (UTF)
4340                             old = utf8_hop((U8*)s, old) - (U8*)s;
4341                         l -= old;
4342                         /* Get the added string: */
4343                         last_str = newSVpvn_utf8(s  + old, l, UTF);
4344                         if (deltanext == 0 && pos_before == b) {
4345                             /* What was added is a constant string */
4346                             if (mincount > 1) {
4347                                 SvGROW(last_str, (mincount * l) + 1);
4348                                 repeatcpy(SvPVX(last_str) + l,
4349                                           SvPVX_const(last_str), l, mincount - 1);
4350                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4351                                 /* Add additional parts. */
4352                                 SvCUR_set(data->last_found,
4353                                           SvCUR(data->last_found) - l);
4354                                 sv_catsv(data->last_found, last_str);
4355                                 {
4356                                     SV * sv = data->last_found;
4357                                     MAGIC *mg =
4358                                         SvUTF8(sv) && SvMAGICAL(sv) ?
4359                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4360                                     if (mg && mg->mg_len >= 0)
4361                                         mg->mg_len += CHR_SVLEN(last_str) - l;
4362                                 }
4363                                 data->last_end += l * (mincount - 1);
4364                             }
4365                         } else {
4366                             /* start offset must point into the last copy */
4367                             data->last_start_min += minnext * (mincount - 1);
4368                             data->last_start_max += is_inf ? SSize_t_MAX
4369                                 : (maxcount - 1) * (minnext + data->pos_delta);
4370                         }
4371                     }
4372                     /* It is counted once already... */
4373                     data->pos_min += minnext * (mincount - counted);
4374 #if 0
4375 PerlIO_printf(Perl_debug_log, "counted=%"UVdf" deltanext=%"UVdf
4376                               " SSize_t_MAX=%"UVdf" minnext=%"UVdf
4377                               " maxcount=%"UVdf" mincount=%"UVdf"\n",
4378     (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4379     (UV)mincount);
4380 if (deltanext != SSize_t_MAX)
4381 PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
4382     (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4383           - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4384 #endif
4385                     if (deltanext == SSize_t_MAX ||
4386                         -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4387                         data->pos_delta = SSize_t_MAX;
4388                     else
4389                         data->pos_delta += - counted * deltanext +
4390                         (minnext + deltanext) * maxcount - minnext * mincount;
4391                     if (mincount != maxcount) {
4392                          /* Cannot extend fixed substrings found inside
4393                             the group.  */
4394                         SCAN_COMMIT(pRExC_state,data,minlenp);
4395                         if (mincount && last_str) {
4396                             SV * const sv = data->last_found;
4397                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4398                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4399
4400                             if (mg)
4401                                 mg->mg_len = -1;
4402                             sv_setsv(sv, last_str);
4403                             data->last_end = data->pos_min;
4404                             data->last_start_min =
4405                                 data->pos_min - CHR_SVLEN(last_str);
4406                             data->last_start_max = is_inf
4407                                 ? SSize_t_MAX
4408                                 : data->pos_min + data->pos_delta
4409                                 - CHR_SVLEN(last_str);
4410                         }
4411                         data->longest = &(data->longest_float);
4412                     }
4413                     SvREFCNT_dec(last_str);
4414                 }
4415                 if (data && (fl & SF_HAS_EVAL))
4416                     data->flags |= SF_HAS_EVAL;
4417               optimize_curly_tail:
4418                 if (OP(oscan) != CURLYX) {
4419                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4420                            && NEXT_OFF(next))
4421                         NEXT_OFF(oscan) += NEXT_OFF(next);
4422                 }
4423                 continue;
4424
4425             default:
4426 #ifdef DEBUGGING
4427                 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
4428                                                                     OP(scan));
4429 #endif
4430             case REF:
4431             case CLUMP:
4432                 if (flags & SCF_DO_SUBSTR) {
4433                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
4434                     data->longest = &(data->longest_float);
4435                 }
4436                 is_inf = is_inf_internal = 1;
4437                 if (flags & SCF_DO_STCLASS_OR) {
4438                     if (OP(scan) == CLUMP) {
4439                         /* Actually is any start char, but very few code points
4440                          * aren't start characters */
4441                         ssc_match_all_cp(data->start_class);
4442                     }
4443                     else {
4444                         ssc_anything(data->start_class);
4445                     }
4446                 }
4447                 flags &= ~SCF_DO_STCLASS;
4448                 break;
4449             }
4450         }
4451         else if (OP(scan) == LNBREAK) {
4452             if (flags & SCF_DO_STCLASS) {
4453                 if (flags & SCF_DO_STCLASS_AND) {
4454                     ssc_intersection(data->start_class,
4455                                     PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
4456                     ssc_clear_locale(data->start_class);
4457                     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4458                 }
4459                 else if (flags & SCF_DO_STCLASS_OR) {
4460                     ssc_union(data->start_class,
4461                               PL_XPosix_ptrs[_CC_VERTSPACE],
4462                               FALSE);
4463                     ssc_and(pRExC_state, data->start_class, and_withp);
4464                 }
4465                 flags &= ~SCF_DO_STCLASS;
4466             }
4467             min++;
4468             delta++;    /* Because of the 2 char string cr-lf */
4469             if (flags & SCF_DO_SUBSTR) {
4470                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4471                 data->pos_min += 1;
4472                 data->pos_delta += 1;
4473                 data->longest = &(data->longest_float);
4474             }
4475         }
4476         else if (REGNODE_SIMPLE(OP(scan))) {
4477
4478             if (flags & SCF_DO_SUBSTR) {
4479                 SCAN_COMMIT(pRExC_state,data,minlenp);
4480                 data->pos_min++;
4481             }
4482             min++;
4483             if (flags & SCF_DO_STCLASS) {
4484                 bool invert = 0;
4485                 SV* my_invlist = sv_2mortal(_new_invlist(0));
4486                 U8 classnum;
4487                 U8 namedclass;
4488
4489                 if (flags & SCF_DO_STCLASS_AND) {
4490                     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4491                 }
4492
4493                 /* Some of the logic below assumes that switching
4494                    locale on will only add false positives. */
4495                 switch (OP(scan)) {
4496
4497                 default:
4498 #ifdef DEBUGGING
4499                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan));
4500 #endif
4501                 case CANY:
4502                 case SANY:
4503                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4504                         ssc_match_all_cp(data->start_class);
4505                     break;
4506
4507                 case REG_ANY:
4508                     {
4509                         SV* REG_ANY_invlist = _new_invlist(2);
4510                         REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
4511                                                             '\n');
4512                         if (flags & SCF_DO_STCLASS_OR) {
4513                             ssc_union(data->start_class,
4514                                       REG_ANY_invlist,
4515                                       TRUE /* TRUE => invert, hence all but \n
4516                                             */
4517                                       );
4518                         }
4519                         else if (flags & SCF_DO_STCLASS_AND) {
4520                             ssc_intersection(data->start_class,
4521                                              REG_ANY_invlist,
4522                                              TRUE  /* TRUE => invert */
4523                                              );
4524                             ssc_clear_locale(data->start_class);
4525                         }
4526                         SvREFCNT_dec_NN(REG_ANY_invlist);
4527                     }
4528                     break;
4529
4530                 case ANYOF_WARN_SUPER:
4531                 case ANYOF:
4532                     if (flags & SCF_DO_STCLASS_AND)
4533                         ssc_and(pRExC_state, data->start_class,
4534                                 (regnode_ssc*) scan);
4535                     else
4536                         ssc_or(pRExC_state, data->start_class,
4537                                                           (regnode_ssc*)scan);
4538                     break;
4539
4540                 case NPOSIXL:
4541                     invert = 1;
4542                     /* FALL THROUGH */
4543
4544                 case POSIXL:
4545                     classnum = FLAGS(scan);
4546                     namedclass = classnum_to_namedclass(classnum) + invert;
4547                     if (flags & SCF_DO_STCLASS_AND) {
4548                         bool was_there = cBOOL(
4549                                           ANYOF_POSIXL_TEST(data->start_class,
4550                                                                  namedclass));
4551                         ANYOF_POSIXL_ZERO(data->start_class);
4552                         if (was_there) {    /* Do an AND */
4553                             ANYOF_POSIXL_SET(data->start_class, namedclass);
4554                         }
4555                         /* No individual code points can now match */
4556                         data->start_class->invlist
4557                                                 = sv_2mortal(_new_invlist(0));
4558                     }
4559                     else {
4560                         int complement = namedclass + ((invert) ? -1 : 1);
4561
4562                         assert(flags & SCF_DO_STCLASS_OR);
4563
4564                         /* If the complement of this class was already there,
4565                          * the result is that they match all code points,
4566                          * (\d + \D == everything).  Remove the classes from
4567                          * future consideration.  Locale is not relevant in
4568                          * this case */
4569                         if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
4570                             ssc_match_all_cp(data->start_class);
4571                             ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
4572                             ANYOF_POSIXL_CLEAR(data->start_class, complement);
4573                             if (! ANYOF_POSIXL_TEST_ANY_SET(data->start_class))
4574                             {
4575                                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_POSIXL;
4576                             }
4577                         }
4578                         else {  /* The usual case; just add this class to the
4579                                    existing set */
4580                             ANYOF_POSIXL_SET(data->start_class, namedclass);
4581                             ANYOF_FLAGS(data->start_class)
4582                                                 |= ANYOF_LOCALE|ANYOF_POSIXL;
4583                         }
4584                     }
4585                     break;
4586
4587                 case NPOSIXA:   /* For these, we always know the exact set of
4588                                    what's matched */
4589                     invert = 1;
4590                     /* FALL THROUGH */
4591                 case POSIXA:
4592                     classnum = FLAGS(scan);
4593                     my_invlist = PL_Posix_ptrs[classnum];
4594                     goto join_posix;
4595
4596                 case NPOSIXD:
4597                 case NPOSIXU:
4598                     invert = 1;
4599                     /* FALL THROUGH */
4600                 case POSIXD:
4601                 case POSIXU:
4602                     classnum = FLAGS(scan);
4603
4604                     /* If we know all the code points that match the class, use
4605                      * that; otherwise use the Latin1 code points, plus we have
4606                      * to assume that it could match anything above Latin1 */
4607                     if (PL_XPosix_ptrs[classnum]) {
4608                         my_invlist = invlist_clone(PL_XPosix_ptrs[classnum]);
4609                     }
4610                     else {
4611                         _invlist_union(PL_L1Posix_ptrs[classnum],
4612                                        PL_AboveLatin1, &my_invlist);
4613                     }
4614
4615                     /* NPOSIXD matches all upper Latin1 code points unless the
4616                      * target string being matched is UTF-8, which is
4617                      * unknowable until match time */
4618                     if (PL_regkind[OP(scan)] == NPOSIXD) {
4619                         _invlist_union_complement_2nd(my_invlist,
4620                                         PL_Posix_ptrs[_CC_ASCII], &my_invlist);
4621                     }
4622
4623                   join_posix:
4624
4625                     if (flags & SCF_DO_STCLASS_AND) {
4626                         ssc_intersection(data->start_class, my_invlist, invert);
4627                         ssc_clear_locale(data->start_class);
4628                     }
4629                     else {
4630                         assert(flags & SCF_DO_STCLASS_OR);
4631                         ssc_union(data->start_class, my_invlist, invert);
4632                     }
4633                 }
4634                 if (flags & SCF_DO_STCLASS_OR)
4635                     ssc_and(pRExC_state, data->start_class, and_withp);
4636                 flags &= ~SCF_DO_STCLASS;
4637             }
4638         }
4639         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4640             data->flags |= (OP(scan) == MEOL
4641                             ? SF_BEFORE_MEOL
4642                             : SF_BEFORE_SEOL);
4643             SCAN_COMMIT(pRExC_state, data, minlenp);
4644
4645         }
4646         else if (  PL_regkind[OP(scan)] == BRANCHJ
4647                  /* Lookbehind, or need to calculate parens/evals/stclass: */
4648                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
4649                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4650             if ( OP(scan) == UNLESSM &&
4651                  scan->flags == 0 &&
4652                  OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4653                  OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4654             ) {
4655                 regnode *opt;
4656                 regnode *upto= regnext(scan);
4657                 DEBUG_PARSE_r({
4658                     SV * const mysv_val=sv_newmortal();
4659                     DEBUG_STUDYDATA("OPFAIL",data,depth);
4660
4661                     /*DEBUG_PARSE_MSG("opfail");*/
4662                     regprop(RExC_rx, mysv_val, upto);
4663                     PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4664                                   SvPV_nolen_const(mysv_val),
4665                                   (IV)REG_NODE_NUM(upto),
4666                                   (IV)(upto - scan)
4667                     );
4668                 });
4669                 OP(scan) = OPFAIL;
4670                 NEXT_OFF(scan) = upto - scan;
4671                 for (opt= scan + 1; opt < upto ; opt++)
4672                     OP(opt) = OPTIMIZED;
4673                 scan= upto;
4674                 continue;
4675             }
4676             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4677                 || OP(scan) == UNLESSM )
4678             {
4679                 /* Negative Lookahead/lookbehind
4680                    In this case we can't do fixed string optimisation.
4681                 */
4682
4683                 SSize_t deltanext, minnext, fake = 0;
4684                 regnode *nscan;
4685                 regnode_ssc intrnl;
4686                 int f = 0;
4687
4688                 data_fake.flags = 0;
4689                 if (data) {
4690                     data_fake.whilem_c = data->whilem_c;
4691                     data_fake.last_closep = data->last_closep;
4692                 }
4693                 else
4694                     data_fake.last_closep = &fake;
4695                 data_fake.pos_delta = delta;
4696                 if ( flags & SCF_DO_STCLASS && !scan->flags
4697                      && OP(scan) == IFMATCH ) { /* Lookahead */
4698                     ssc_init(pRExC_state, &intrnl);
4699                     data_fake.start_class = &intrnl;
4700                     f |= SCF_DO_STCLASS_AND;
4701                 }
4702                 if (flags & SCF_WHILEM_VISITED_POS)
4703                     f |= SCF_WHILEM_VISITED_POS;
4704                 next = regnext(scan);
4705                 nscan = NEXTOPER(NEXTOPER(scan));
4706                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4707                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4708                 if (scan->flags) {
4709                     if (deltanext) {
4710                         FAIL("Variable length lookbehind not implemented");
4711                     }
4712                     else if (minnext > (I32)U8_MAX) {
4713                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4714                     }
4715                     scan->flags = (U8)minnext;
4716                 }
4717                 if (data) {
4718                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4719                         pars++;
4720                     if (data_fake.flags & SF_HAS_EVAL)
4721                         data->flags |= SF_HAS_EVAL;
4722                     data->whilem_c = data_fake.whilem_c;
4723                 }
4724                 if (f & SCF_DO_STCLASS_AND) {
4725                     if (flags & SCF_DO_STCLASS_OR) {
4726                         /* OR before, AND after: ideally we would recurse with
4727                          * data_fake to get the AND applied by study of the
4728                          * remainder of the pattern, and then derecurse;
4729                          * *** HACK *** for now just treat as "no information".
4730                          * See [perl #56690].
4731                          */
4732                         ssc_init(pRExC_state, data->start_class);
4733                     }  else {
4734                         /* AND before and after: combine and continue */
4735                         ssc_and(pRExC_state, data->start_class, &intrnl);
4736                     }
4737                 }
4738             }
4739 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4740             else {
4741                 /* Positive Lookahead/lookbehind
4742                    In this case we can do fixed string optimisation,
4743                    but we must be careful about it. Note in the case of
4744                    lookbehind the positions will be offset by the minimum
4745                    length of the pattern, something we won't know about
4746                    until after the recurse.
4747                 */
4748                 SSize_t deltanext, fake = 0;
4749                 regnode *nscan;
4750                 regnode_ssc intrnl;
4751                 int f = 0;
4752                 /* We use SAVEFREEPV so that when the full compile
4753                     is finished perl will clean up the allocated
4754                     minlens when it's all done. This way we don't
4755                     have to worry about freeing them when we know
4756                     they wont be used, which would be a pain.
4757                  */
4758                 SSize_t *minnextp;
4759                 Newx( minnextp, 1, SSize_t );
4760                 SAVEFREEPV(minnextp);
4761
4762                 if (data) {
4763                     StructCopy(data, &data_fake, scan_data_t);
4764                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4765                         f |= SCF_DO_SUBSTR;
4766                         if (scan->flags)
4767                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4768                         data_fake.last_found=newSVsv(data->last_found);
4769                     }
4770                 }
4771                 else
4772                     data_fake.last_closep = &fake;
4773                 data_fake.flags = 0;
4774                 data_fake.pos_delta = delta;
4775                 if (is_inf)
4776                     data_fake.flags |= SF_IS_INF;
4777                 if ( flags & SCF_DO_STCLASS && !scan->flags
4778                      && OP(scan) == IFMATCH ) { /* Lookahead */
4779                     ssc_init(pRExC_state, &intrnl);
4780                     data_fake.start_class = &intrnl;
4781                     f |= SCF_DO_STCLASS_AND;
4782                 }
4783                 if (flags & SCF_WHILEM_VISITED_POS)
4784                     f |= SCF_WHILEM_VISITED_POS;
4785                 next = regnext(scan);
4786                 nscan = NEXTOPER(NEXTOPER(scan));
4787
4788                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4789                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4790                 if (scan->flags) {
4791                     if (deltanext) {
4792                         FAIL("Variable length lookbehind not implemented");
4793                     }
4794                     else if (*minnextp > (I32)U8_MAX) {
4795                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4796                     }
4797                     scan->flags = (U8)*minnextp;
4798                 }
4799
4800                 *minnextp += min;
4801
4802                 if (f & SCF_DO_STCLASS_AND) {
4803                     ssc_and(pRExC_state, data->start_class, &intrnl);
4804                 }
4805                 if (data) {
4806                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4807                         pars++;
4808                     if (data_fake.flags & SF_HAS_EVAL)
4809                         data->flags |= SF_HAS_EVAL;
4810                     data->whilem_c = data_fake.whilem_c;
4811                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4812                         if (RExC_rx->minlen<*minnextp)
4813                             RExC_rx->minlen=*minnextp;
4814                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4815                         SvREFCNT_dec_NN(data_fake.last_found);
4816
4817                         if ( data_fake.minlen_fixed != minlenp )
4818                         {
4819                             data->offset_fixed= data_fake.offset_fixed;
4820                             data->minlen_fixed= data_fake.minlen_fixed;
4821                             data->lookbehind_fixed+= scan->flags;
4822                         }
4823                         if ( data_fake.minlen_float != minlenp )
4824                         {
4825                             data->minlen_float= data_fake.minlen_float;
4826                             data->offset_float_min=data_fake.offset_float_min;
4827                             data->offset_float_max=data_fake.offset_float_max;
4828                             data->lookbehind_float+= scan->flags;
4829                         }
4830                     }
4831                 }
4832             }
4833 #endif
4834         }
4835         else if (OP(scan) == OPEN) {
4836             if (stopparen != (I32)ARG(scan))
4837                 pars++;
4838         }
4839         else if (OP(scan) == CLOSE) {
4840             if (stopparen == (I32)ARG(scan)) {
4841                 break;
4842             }
4843             if ((I32)ARG(scan) == is_par) {
4844                 next = regnext(scan);
4845
4846                 if ( next && (OP(next) != WHILEM) && next < last)
4847                     is_par = 0;         /* Disable optimization */
4848             }
4849             if (data)
4850                 *(data->last_closep) = ARG(scan);
4851         }
4852         else if (OP(scan) == EVAL) {
4853                 if (data)
4854                     data->flags |= SF_HAS_EVAL;
4855         }
4856         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4857             if (flags & SCF_DO_SUBSTR) {
4858                 SCAN_COMMIT(pRExC_state,data,minlenp);
4859                 flags &= ~SCF_DO_SUBSTR;
4860             }
4861             if (data && OP(scan)==ACCEPT) {
4862                 data->flags |= SCF_SEEN_ACCEPT;
4863                 if (stopmin > min)
4864                     stopmin = min;
4865             }
4866         }
4867         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4868         {
4869                 if (flags & SCF_DO_SUBSTR) {
4870                     SCAN_COMMIT(pRExC_state,data,minlenp);
4871                     data->longest = &(data->longest_float);
4872                 }
4873                 is_inf = is_inf_internal = 1;
4874                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4875                     ssc_anything(data->start_class);
4876                 flags &= ~SCF_DO_STCLASS;
4877         }
4878         else if (OP(scan) == GPOS) {
4879             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4880                 !(delta || is_inf || (data && data->pos_delta)))
4881             {
4882                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4883                     RExC_rx->extflags |= RXf_ANCH_GPOS;
4884                 if (RExC_rx->gofs < (STRLEN)min)
4885                     RExC_rx->gofs = min;
4886             } else {
4887                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4888                 RExC_rx->gofs = 0;
4889             }
4890         }
4891 #ifdef TRIE_STUDY_OPT
4892 #ifdef FULL_TRIE_STUDY
4893         else if (PL_regkind[OP(scan)] == TRIE) {
4894             /* NOTE - There is similar code to this block above for handling
4895                BRANCH nodes on the initial study.  If you change stuff here
4896                check there too. */
4897             regnode *trie_node= scan;
4898             regnode *tail= regnext(scan);
4899             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4900             SSize_t max1 = 0, min1 = SSize_t_MAX;
4901             regnode_ssc accum;
4902
4903             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4904                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4905             if (flags & SCF_DO_STCLASS)
4906                 ssc_init_zero(pRExC_state, &accum);
4907
4908             if (!trie->jump) {
4909                 min1= trie->minlen;
4910                 max1= trie->maxlen;
4911             } else {
4912                 const regnode *nextbranch= NULL;
4913                 U32 word;
4914
4915                 for ( word=1 ; word <= trie->wordcount ; word++)
4916                 {
4917                     SSize_t deltanext=0, minnext=0, f = 0, fake;
4918                     regnode_ssc this_class;
4919
4920                     data_fake.flags = 0;
4921                     if (data) {
4922                         data_fake.whilem_c = data->whilem_c;
4923                         data_fake.last_closep = data->last_closep;
4924                     }
4925                     else
4926                         data_fake.last_closep = &fake;
4927                     data_fake.pos_delta = delta;
4928                     if (flags & SCF_DO_STCLASS) {
4929                         ssc_init(pRExC_state, &this_class);
4930                         data_fake.start_class = &this_class;
4931                         f = SCF_DO_STCLASS_AND;
4932                     }
4933                     if (flags & SCF_WHILEM_VISITED_POS)
4934                         f |= SCF_WHILEM_VISITED_POS;
4935
4936                     if (trie->jump[word]) {
4937                         if (!nextbranch)
4938                             nextbranch = trie_node + trie->jump[0];
4939                         scan= trie_node + trie->jump[word];
4940                         /* We go from the jump point to the branch that follows
4941                            it. Note this means we need the vestigal unused branches
4942                            even though they arent otherwise used.
4943                          */
4944                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4945                             &deltanext, (regnode *)nextbranch, &data_fake,
4946                             stopparen, recursed, NULL, f,depth+1);
4947                     }
4948                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4949                         nextbranch= regnext((regnode*)nextbranch);
4950
4951                     if (min1 > (SSize_t)(minnext + trie->minlen))
4952                         min1 = minnext + trie->minlen;
4953                     if (deltanext == SSize_t_MAX) {
4954                         is_inf = is_inf_internal = 1;
4955                         max1 = SSize_t_MAX;
4956                     } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
4957                         max1 = minnext + deltanext + trie->maxlen;
4958
4959                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4960                         pars++;
4961                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4962                         if ( stopmin > min + min1)
4963                             stopmin = min + min1;
4964                         flags &= ~SCF_DO_SUBSTR;
4965                         if (data)
4966                             data->flags |= SCF_SEEN_ACCEPT;
4967                     }
4968                     if (data) {
4969                         if (data_fake.flags & SF_HAS_EVAL)
4970                             data->flags |= SF_HAS_EVAL;
4971                         data->whilem_c = data_fake.whilem_c;
4972                     }
4973                     if (flags & SCF_DO_STCLASS)
4974                         ssc_or(pRExC_state, &accum, &this_class);
4975                 }
4976             }
4977             if (flags & SCF_DO_SUBSTR) {
4978                 data->pos_min += min1;
4979                 data->pos_delta += max1 - min1;
4980                 if (max1 != min1 || is_inf)
4981                     data->longest = &(data->longest_float);
4982             }
4983             min += min1;
4984             delta += max1 - min1;
4985             if (flags & SCF_DO_STCLASS_OR) {
4986                 ssc_or(pRExC_state, data->start_class, &accum);
4987                 if (min1) {
4988                     ssc_and(pRExC_state, data->start_class, and_withp);
4989                     flags &= ~SCF_DO_STCLASS;
4990                 }
4991             }
4992             else if (flags & SCF_DO_STCLASS_AND) {
4993                 if (min1) {
4994                     ssc_and(pRExC_state, data->start_class, &accum);
4995                     flags &= ~SCF_DO_STCLASS;
4996                 }
4997                 else {
4998                     /* Switch to OR mode: cache the old value of
4999                      * data->start_class */
5000                     INIT_AND_WITHP;
5001                     StructCopy(data->start_class, and_withp, regnode_ssc);
5002                     flags &= ~SCF_DO_STCLASS_AND;
5003                     StructCopy(&accum, data->start_class, regnode_ssc);
5004                     flags |= SCF_DO_STCLASS_OR;
5005                 }
5006             }
5007             scan= tail;
5008             continue;
5009         }
5010 #else
5011         else if (PL_regkind[OP(scan)] == TRIE) {
5012             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5013             U8*bang=NULL;
5014
5015             min += trie->minlen;
5016             delta += (trie->maxlen - trie->minlen);
5017             flags &= ~SCF_DO_STCLASS; /* xxx */
5018             if (flags & SCF_DO_SUBSTR) {
5019                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
5020                 data->pos_min += trie->minlen;
5021                 data->pos_delta += (trie->maxlen - trie->minlen);
5022                 if (trie->maxlen != trie->minlen)
5023                     data->longest = &(data->longest_float);
5024             }
5025             if (trie->jump) /* no more substrings -- for now /grr*/
5026                 flags &= ~SCF_DO_SUBSTR;
5027         }
5028 #endif /* old or new */
5029 #endif /* TRIE_STUDY_OPT */
5030
5031         /* Else: zero-length, ignore. */
5032         scan = regnext(scan);
5033     }
5034     if (frame) {
5035         last = frame->last;
5036         scan = frame->next;
5037         stopparen = frame->stop;
5038         frame = frame->prev;
5039         goto fake_study_recurse;
5040     }
5041
5042   finish:
5043     assert(!frame);
5044     DEBUG_STUDYDATA("pre-fin:",data,depth);
5045
5046     *scanp = scan;
5047     *deltap = is_inf_internal ? SSize_t_MAX : delta;
5048     if (flags & SCF_DO_SUBSTR && is_inf)
5049         data->pos_delta = SSize_t_MAX - data->pos_min;
5050     if (is_par > (I32)U8_MAX)
5051         is_par = 0;
5052     if (is_par && pars==1 && data) {
5053         data->flags |= SF_IN_PAR;
5054         data->flags &= ~SF_HAS_PAR;
5055     }
5056     else if (pars && data) {
5057         data->flags |= SF_HAS_PAR;
5058         data->flags &= ~SF_IN_PAR;
5059     }
5060     if (flags & SCF_DO_STCLASS_OR)
5061         ssc_and(pRExC_state, data->start_class, and_withp);
5062     if (flags & SCF_TRIE_RESTUDY)
5063         data->flags |=  SCF_TRIE_RESTUDY;
5064
5065     DEBUG_STUDYDATA("post-fin:",data,depth);
5066
5067     return min < stopmin ? min : stopmin;
5068 }
5069
5070 STATIC U32
5071 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5072 {
5073     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5074
5075     PERL_ARGS_ASSERT_ADD_DATA;
5076
5077     Renewc(RExC_rxi->data,
5078            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5079            char, struct reg_data);
5080     if(count)
5081         Renew(RExC_rxi->data->what, count + n, U8);
5082     else
5083         Newx(RExC_rxi->data->what, n, U8);
5084     RExC_rxi->data->count = count + n;
5085     Copy(s, RExC_rxi->data->what + count, n, U8);
5086     return count;
5087 }
5088
5089 /*XXX: todo make this not included in a non debugging perl */
5090 #ifndef PERL_IN_XSUB_RE
5091 void
5092 Perl_reginitcolors(pTHX)
5093 {
5094     dVAR;
5095     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5096     if (s) {
5097         char *t = savepv(s);
5098         int i = 0;
5099         PL_colors[0] = t;
5100         while (++i < 6) {
5101             t = strchr(t, '\t');
5102             if (t) {
5103                 *t = '\0';
5104                 PL_colors[i] = ++t;
5105             }
5106             else
5107                 PL_colors[i] = t = (char *)"";
5108         }
5109     } else {
5110         int i = 0;
5111         while (i < 6)
5112             PL_colors[i++] = (char *)"";
5113     }
5114     PL_colorset = 1;
5115 }
5116 #endif
5117
5118
5119 #ifdef TRIE_STUDY_OPT
5120 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
5121     STMT_START {                                            \
5122         if (                                                \
5123               (data.flags & SCF_TRIE_RESTUDY)               \
5124               && ! restudied++                              \
5125         ) {                                                 \
5126             dOsomething;                                    \
5127             goto reStudy;                                   \
5128         }                                                   \
5129     } STMT_END
5130 #else
5131 #define CHECK_RESTUDY_GOTO_butfirst
5132 #endif
5133
5134 /*
5135  * pregcomp - compile a regular expression into internal code
5136  *
5137  * Decides which engine's compiler to call based on the hint currently in
5138  * scope
5139  */
5140
5141 #ifndef PERL_IN_XSUB_RE
5142
5143 /* return the currently in-scope regex engine (or the default if none)  */
5144
5145 regexp_engine const *
5146 Perl_current_re_engine(pTHX)
5147 {
5148     dVAR;
5149
5150     if (IN_PERL_COMPILETIME) {
5151         HV * const table = GvHV(PL_hintgv);
5152         SV **ptr;
5153
5154         if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5155             return &PL_core_reg_engine;
5156         ptr = hv_fetchs(table, "regcomp", FALSE);
5157         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5158             return &PL_core_reg_engine;
5159         return INT2PTR(regexp_engine*,SvIV(*ptr));
5160     }
5161     else {
5162         SV *ptr;
5163         if (!PL_curcop->cop_hints_hash)
5164             return &PL_core_reg_engine;
5165         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5166         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5167             return &PL_core_reg_engine;
5168         return INT2PTR(regexp_engine*,SvIV(ptr));
5169     }
5170 }
5171
5172
5173 REGEXP *
5174 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5175 {
5176     dVAR;
5177     regexp_engine const *eng = current_re_engine();
5178     GET_RE_DEBUG_FLAGS_DECL;
5179
5180     PERL_ARGS_ASSERT_PREGCOMP;
5181
5182     /* Dispatch a request to compile a regexp to correct regexp engine. */
5183     DEBUG_COMPILE_r({
5184         PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5185                         PTR2UV(eng));
5186     });
5187     return CALLREGCOMP_ENG(eng, pattern, flags);
5188 }
5189 #endif
5190
5191 /* public(ish) entry point for the perl core's own regex compiling code.
5192  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5193  * pattern rather than a list of OPs, and uses the internal engine rather
5194  * than the current one */
5195
5196 REGEXP *
5197 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5198 {
5199     SV *pat = pattern; /* defeat constness! */
5200     PERL_ARGS_ASSERT_RE_COMPILE;
5201     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5202 #ifdef PERL_IN_XSUB_RE
5203                                 &my_reg_engine,
5204 #else
5205                                 &PL_core_reg_engine,
5206 #endif
5207                                 NULL, NULL, rx_flags, 0);
5208 }
5209
5210
5211 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5212  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5213  * point to the realloced string and length.
5214  *
5215  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5216  * stuff added */
5217
5218 static void
5219 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5220                     char **pat_p, STRLEN *plen_p, int num_code_blocks)
5221 {
5222     U8 *const src = (U8*)*pat_p;
5223     U8 *dst;
5224     int n=0;
5225     STRLEN s = 0, d = 0;
5226     bool do_end = 0;
5227     GET_RE_DEBUG_FLAGS_DECL;
5228
5229     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5230         "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5231
5232     Newx(dst, *plen_p * 2 + 1, U8);
5233
5234     while (s < *plen_p) {
5235         if (NATIVE_BYTE_IS_INVARIANT(src[s]))
5236             dst[d]   = src[s];
5237         else {
5238             dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
5239             dst[d]   = UTF8_EIGHT_BIT_LO(src[s]);
5240         }
5241         if (n < num_code_blocks) {
5242             if (!do_end && pRExC_state->code_blocks[n].start == s) {
5243                 pRExC_state->code_blocks[n].start = d;
5244                 assert(dst[d] == '(');
5245                 do_end = 1;
5246             }
5247             else if (do_end && pRExC_state->code_blocks[n].end == s) {
5248                 pRExC_state->code_blocks[n].end = d;
5249                 assert(dst[d] == ')');
5250                 do_end = 0;
5251                 n++;
5252             }
5253         }
5254         s++;
5255         d++;
5256     }
5257     dst[d] = '\0';
5258     *plen_p = d;
5259     *pat_p = (char*) dst;
5260     SAVEFREEPV(*pat_p);
5261     RExC_orig_utf8 = RExC_utf8 = 1;
5262 }
5263
5264
5265
5266 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5267  * while recording any code block indices, and handling overloading,
5268  * nested qr// objects etc.  If pat is null, it will allocate a new
5269  * string, or just return the first arg, if there's only one.
5270  *
5271  * Returns the malloced/updated pat.
5272  * patternp and pat_count is the array of SVs to be concatted;
5273  * oplist is the optional list of ops that generated the SVs;
5274  * recompile_p is a pointer to a boolean that will be set if
5275  *   the regex will need to be recompiled.
5276  * delim, if non-null is an SV that will be inserted between each element
5277  */
5278
5279 static SV*
5280 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5281                 SV *pat, SV ** const patternp, int pat_count,
5282                 OP *oplist, bool *recompile_p, SV *delim)
5283 {
5284     SV **svp;
5285     int n = 0;
5286     bool use_delim = FALSE;
5287     bool alloced = FALSE;
5288
5289     /* if we know we have at least two args, create an empty string,
5290      * then concatenate args to that. For no args, return an empty string */
5291     if (!pat && pat_count != 1) {
5292         pat = newSVpvn("", 0);
5293         SAVEFREESV(pat);
5294         alloced = TRUE;
5295     }
5296
5297     for (svp = patternp; svp < patternp + pat_count; svp++) {
5298         SV *sv;
5299         SV *rx  = NULL;
5300         STRLEN orig_patlen = 0;
5301         bool code = 0;
5302         SV *msv = use_delim ? delim : *svp;
5303         if (!msv) msv = &PL_sv_undef;
5304
5305         /* if we've got a delimiter, we go round the loop twice for each
5306          * svp slot (except the last), using the delimiter the second
5307          * time round */
5308         if (use_delim) {
5309             svp--;
5310             use_delim = FALSE;
5311         }
5312         else if (delim)
5313             use_delim = TRUE;
5314
5315         if (SvTYPE(msv) == SVt_PVAV) {
5316             /* we've encountered an interpolated array within
5317              * the pattern, e.g. /...@a..../. Expand the list of elements,
5318              * then recursively append elements.
5319              * The code in this block is based on S_pushav() */
5320
5321             AV *const av = (AV*)msv;
5322             const SSize_t maxarg = AvFILL(av) + 1;
5323             SV **array;
5324
5325             if (oplist) {
5326                 assert(oplist->op_type == OP_PADAV
5327                     || oplist->op_type == OP_RV2AV);
5328                 oplist = oplist->op_sibling;;
5329             }
5330
5331             if (SvRMAGICAL(av)) {
5332                 SSize_t i;
5333
5334                 Newx(array, maxarg, SV*);
5335                 SAVEFREEPV(array);
5336                 for (i=0; i < maxarg; i++) {
5337                     SV ** const svp = av_fetch(av, i, FALSE);
5338                     array[i] = svp ? *svp : &PL_sv_undef;
5339                 }
5340             }
5341             else
5342                 array = AvARRAY(av);
5343
5344             pat = S_concat_pat(aTHX_ pRExC_state, pat,
5345                                 array, maxarg, NULL, recompile_p,
5346                                 /* $" */
5347                                 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5348
5349             continue;
5350         }
5351
5352
5353         /* we make the assumption here that each op in the list of
5354          * op_siblings maps to one SV pushed onto the stack,
5355          * except for code blocks, with have both an OP_NULL and
5356          * and OP_CONST.
5357          * This allows us to match up the list of SVs against the
5358          * list of OPs to find the next code block.
5359          *
5360          * Note that       PUSHMARK PADSV PADSV ..
5361          * is optimised to
5362          *                 PADRANGE PADSV  PADSV  ..
5363          * so the alignment still works. */
5364
5365         if (oplist) {
5366             if (oplist->op_type == OP_NULL
5367                 && (oplist->op_flags & OPf_SPECIAL))
5368             {
5369                 assert(n < pRExC_state->num_code_blocks);
5370                 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5371                 pRExC_state->code_blocks[n].block = oplist;
5372                 pRExC_state->code_blocks[n].src_regex = NULL;
5373                 n++;
5374                 code = 1;
5375                 oplist = oplist->op_sibling; /* skip CONST */
5376                 assert(oplist);
5377             }
5378             oplist = oplist->op_sibling;;
5379         }
5380
5381         /* apply magic and QR overloading to arg */
5382
5383         SvGETMAGIC(msv);
5384         if (SvROK(msv) && SvAMAGIC(msv)) {
5385             SV *sv = AMG_CALLunary(msv, regexp_amg);
5386             if (sv) {
5387                 if (SvROK(sv))
5388                     sv = SvRV(sv);
5389                 if (SvTYPE(sv) != SVt_REGEXP)
5390                     Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5391                 msv = sv;
5392             }
5393         }
5394
5395         /* try concatenation overload ... */
5396         if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5397                 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5398         {
5399             sv_setsv(pat, sv);
5400             /* overloading involved: all bets are off over literal
5401              * code. Pretend we haven't seen it */
5402             pRExC_state->num_code_blocks -= n;
5403             n = 0;
5404         }
5405         else  {
5406             /* ... or failing that, try "" overload */
5407             while (SvAMAGIC(msv)
5408                     && (sv = AMG_CALLunary(msv, string_amg))
5409                     && sv != msv
5410                     &&  !(   SvROK(msv)
5411                           && SvROK(sv)
5412                           && SvRV(msv) == SvRV(sv))
5413             ) {
5414                 msv = sv;
5415                 SvGETMAGIC(msv);
5416             }
5417             if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5418                 msv = SvRV(msv);
5419
5420             if (pat) {
5421                 /* this is a partially unrolled
5422                  *     sv_catsv_nomg(pat, msv);
5423                  * that allows us to adjust code block indices if
5424                  * needed */
5425                 STRLEN dlen;
5426                 char *dst = SvPV_force_nomg(pat, dlen);
5427                 orig_patlen = dlen;
5428                 if (SvUTF8(msv) && !SvUTF8(pat)) {
5429                     S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5430                     sv_setpvn(pat, dst, dlen);
5431                     SvUTF8_on(pat);
5432                 }
5433                 sv_catsv_nomg(pat, msv);
5434                 rx = msv;
5435             }
5436             else
5437                 pat = msv;
5438
5439             if (code)
5440                 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5441         }
5442
5443         /* extract any code blocks within any embedded qr//'s */
5444         if (rx && SvTYPE(rx) == SVt_REGEXP
5445             && RX_ENGINE((REGEXP*)rx)->op_comp)
5446         {
5447
5448             RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5449             if (ri->num_code_blocks) {
5450                 int i;
5451                 /* the presence of an embedded qr// with code means
5452                  * we should always recompile: the text of the
5453                  * qr// may not have changed, but it may be a
5454                  * different closure than last time */
5455                 *recompile_p = 1;
5456                 Renew(pRExC_state->code_blocks,
5457                     pRExC_state->num_code_blocks + ri->num_code_blocks,
5458                     struct reg_code_block);
5459                 pRExC_state->num_code_blocks += ri->num_code_blocks;
5460
5461                 for (i=0; i < ri->num_code_blocks; i++) {
5462                     struct reg_code_block *src, *dst;
5463                     STRLEN offset =  orig_patlen
5464                         + ReANY((REGEXP *)rx)->pre_prefix;
5465                     assert(n < pRExC_state->num_code_blocks);
5466                     src = &ri->code_blocks[i];
5467                     dst = &pRExC_state->code_blocks[n];
5468                     dst->start      = src->start + offset;
5469                     dst->end        = src->end   + offset;
5470                     dst->block      = src->block;
5471                     dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
5472                                             src->src_regex
5473                                                 ? src->src_regex
5474                                                 : (REGEXP*)rx);
5475                     n++;
5476                 }
5477             }
5478         }
5479     }
5480     /* avoid calling magic multiple times on a single element e.g. =~ $qr */
5481     if (alloced)
5482         SvSETMAGIC(pat);
5483
5484     return pat;
5485 }
5486
5487
5488
5489 /* see if there are any run-time code blocks in the pattern.
5490  * False positives are allowed */
5491
5492 static bool
5493 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5494                     char *pat, STRLEN plen)
5495 {
5496     int n = 0;
5497     STRLEN s;
5498
5499     for (s = 0; s < plen; s++) {
5500         if (n < pRExC_state->num_code_blocks
5501             && s == pRExC_state->code_blocks[n].start)
5502         {
5503             s = pRExC_state->code_blocks[n].end;
5504             n++;
5505             continue;
5506         }
5507         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5508          * positives here */
5509         if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
5510             (pat[s+2] == '{'
5511                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
5512         )
5513             return 1;
5514     }
5515     return 0;
5516 }
5517
5518 /* Handle run-time code blocks. We will already have compiled any direct
5519  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5520  * copy of it, but with any literal code blocks blanked out and
5521  * appropriate chars escaped; then feed it into
5522  *
5523  *    eval "qr'modified_pattern'"
5524  *
5525  * For example,
5526  *
5527  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5528  *
5529  * becomes
5530  *
5531  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5532  *
5533  * After eval_sv()-ing that, grab any new code blocks from the returned qr
5534  * and merge them with any code blocks of the original regexp.
5535  *
5536  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5537  * instead, just save the qr and return FALSE; this tells our caller that
5538  * the original pattern needs upgrading to utf8.
5539  */
5540
5541 static bool
5542 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5543     char *pat, STRLEN plen)
5544 {
5545     SV *qr;
5546
5547     GET_RE_DEBUG_FLAGS_DECL;
5548
5549     if (pRExC_state->runtime_code_qr) {
5550         /* this is the second time we've been called; this should
5551          * only happen if the main pattern got upgraded to utf8
5552          * during compilation; re-use the qr we compiled first time
5553          * round (which should be utf8 too)
5554          */
5555         qr = pRExC_state->runtime_code_qr;
5556         pRExC_state->runtime_code_qr = NULL;
5557         assert(RExC_utf8 && SvUTF8(qr));
5558     }
5559     else {
5560         int n = 0;
5561         STRLEN s;
5562         char *p, *newpat;
5563         int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5564         SV *sv, *qr_ref;
5565         dSP;
5566
5567         /* determine how many extra chars we need for ' and \ escaping */
5568         for (s = 0; s < plen; s++) {
5569             if (pat[s] == '\'' || pat[s] == '\\')
5570                 newlen++;
5571         }
5572
5573         Newx(newpat, newlen, char);
5574         p = newpat;
5575         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5576
5577         for (s = 0; s < plen; s++) {
5578             if (n < pRExC_state->num_code_blocks
5579                 && s == pRExC_state->code_blocks[n].start)
5580             {
5581                 /* blank out literal code block */
5582                 assert(pat[s] == '(');
5583                 while (s <= pRExC_state->code_blocks[n].end) {
5584                     *p++ = '_';
5585                     s++;
5586                 }
5587                 s--;
5588                 n++;
5589                 continue;
5590             }
5591             if (pat[s] == '\'' || pat[s] == '\\')
5592                 *p++ = '\\';
5593             *p++ = pat[s];
5594         }
5595         *p++ = '\'';
5596         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5597             *p++ = 'x';
5598         *p++ = '\0';
5599         DEBUG_COMPILE_r({
5600             PerlIO_printf(Perl_debug_log,
5601                 "%sre-parsing pattern for runtime code:%s %s\n",
5602                 PL_colors[4],PL_colors[5],newpat);
5603         });
5604
5605         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5606         Safefree(newpat);
5607
5608         ENTER;
5609         SAVETMPS;
5610         save_re_context();
5611         PUSHSTACKi(PERLSI_REQUIRE);
5612         /* G_RE_REPARSING causes the toker to collapse \\ into \ when
5613          * parsing qr''; normally only q'' does this. It also alters
5614          * hints handling */
5615         eval_sv(sv, G_SCALAR|G_RE_REPARSING);
5616         SvREFCNT_dec_NN(sv);
5617         SPAGAIN;
5618         qr_ref = POPs;
5619         PUTBACK;
5620         {
5621             SV * const errsv = ERRSV;
5622             if (SvTRUE_NN(errsv))
5623             {
5624                 Safefree(pRExC_state->code_blocks);
5625                 /* use croak_sv ? */
5626                 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
5627             }
5628         }
5629         assert(SvROK(qr_ref));
5630         qr = SvRV(qr_ref);
5631         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5632         /* the leaving below frees the tmp qr_ref.
5633          * Give qr a life of its own */
5634         SvREFCNT_inc(qr);
5635         POPSTACK;
5636         FREETMPS;
5637         LEAVE;
5638
5639     }
5640
5641     if (!RExC_utf8 && SvUTF8(qr)) {
5642         /* first time through; the pattern got upgraded; save the
5643          * qr for the next time through */
5644         assert(!pRExC_state->runtime_code_qr);
5645         pRExC_state->runtime_code_qr = qr;
5646         return 0;
5647     }
5648
5649
5650     /* extract any code blocks within the returned qr//  */
5651
5652
5653     /* merge the main (r1) and run-time (r2) code blocks into one */
5654     {
5655         RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
5656         struct reg_code_block *new_block, *dst;
5657         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5658         int i1 = 0, i2 = 0;
5659
5660         if (!r2->num_code_blocks) /* we guessed wrong */
5661         {
5662             SvREFCNT_dec_NN(qr);
5663             return 1;
5664         }
5665
5666         Newx(new_block,
5667             r1->num_code_blocks + r2->num_code_blocks,
5668             struct reg_code_block);
5669         dst = new_block;
5670
5671         while (    i1 < r1->num_code_blocks
5672                 || i2 < r2->num_code_blocks)
5673         {
5674             struct reg_code_block *src;
5675             bool is_qr = 0;
5676
5677             if (i1 == r1->num_code_blocks) {
5678                 src = &r2->code_blocks[i2++];
5679                 is_qr = 1;
5680             }
5681             else if (i2 == r2->num_code_blocks)
5682                 src = &r1->code_blocks[i1++];
5683             else if (  r1->code_blocks[i1].start
5684                      < r2->code_blocks[i2].start)
5685             {
5686                 src = &r1->code_blocks[i1++];
5687                 assert(src->end < r2->code_blocks[i2].start);
5688             }
5689             else {
5690                 assert(  r1->code_blocks[i1].start
5691                        > r2->code_blocks[i2].start);
5692                 src = &r2->code_blocks[i2++];
5693                 is_qr = 1;
5694                 assert(src->end < r1->code_blocks[i1].start);
5695             }
5696
5697             assert(pat[src->start] == '(');
5698             assert(pat[src->end]   == ')');
5699             dst->start      = src->start;
5700             dst->end        = src->end;
5701             dst->block      = src->block;
5702             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5703                                     : src->src_regex;
5704             dst++;
5705         }
5706         r1->num_code_blocks += r2->num_code_blocks;
5707         Safefree(r1->code_blocks);
5708         r1->code_blocks = new_block;
5709     }
5710
5711     SvREFCNT_dec_NN(qr);
5712     return 1;
5713 }
5714
5715
5716 STATIC bool
5717 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
5718                       SSize_t lookbehind, SSize_t offset, SSize_t *minlen, STRLEN longest_length, bool eol, bool meol)
5719 {
5720     /* This is the common code for setting up the floating and fixed length
5721      * string data extracted from Perl_re_op_compile() below.  Returns a boolean
5722      * as to whether succeeded or not */
5723
5724     I32 t;
5725     SSize_t ml;
5726
5727     if (! (longest_length
5728            || (eol /* Can't have SEOL and MULTI */
5729                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5730           )
5731             /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5732         || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5733     {
5734         return FALSE;
5735     }
5736
5737     /* copy the information about the longest from the reg_scan_data
5738         over to the program. */
5739     if (SvUTF8(sv_longest)) {
5740         *rx_utf8 = sv_longest;
5741         *rx_substr = NULL;
5742     } else {
5743         *rx_substr = sv_longest;
5744         *rx_utf8 = NULL;
5745     }
5746     /* end_shift is how many chars that must be matched that
5747         follow this item. We calculate it ahead of time as once the
5748         lookbehind offset is added in we lose the ability to correctly
5749         calculate it.*/
5750     ml = minlen ? *(minlen) : (SSize_t)longest_length;
5751     *rx_end_shift = ml - offset
5752         - longest_length + (SvTAIL(sv_longest) != 0)
5753         + lookbehind;
5754
5755     t = (eol/* Can't have SEOL and MULTI */
5756          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5757     fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5758
5759     return TRUE;
5760 }
5761
5762 /*
5763  * Perl_re_op_compile - the perl internal RE engine's function to compile a
5764  * regular expression into internal code.
5765  * The pattern may be passed either as:
5766  *    a list of SVs (patternp plus pat_count)
5767  *    a list of OPs (expr)
5768  * If both are passed, the SV list is used, but the OP list indicates
5769  * which SVs are actually pre-compiled code blocks
5770  *
5771  * The SVs in the list have magic and qr overloading applied to them (and
5772  * the list may be modified in-place with replacement SVs in the latter
5773  * case).
5774  *
5775  * If the pattern hasn't changed from old_re, then old_re will be
5776  * returned.
5777  *
5778  * eng is the current engine. If that engine has an op_comp method, then
5779  * handle directly (i.e. we assume that op_comp was us); otherwise, just
5780  * do the initial concatenation of arguments and pass on to the external
5781  * engine.
5782  *
5783  * If is_bare_re is not null, set it to a boolean indicating whether the
5784  * arg list reduced (after overloading) to a single bare regex which has
5785  * been returned (i.e. /$qr/).
5786  *
5787  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5788  *
5789  * pm_flags contains the PMf_* flags, typically based on those from the
5790  * pm_flags field of the related PMOP. Currently we're only interested in
5791  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5792  *
5793  * We can't allocate space until we know how big the compiled form will be,
5794  * but we can't compile it (and thus know how big it is) until we've got a
5795  * place to put the code.  So we cheat:  we compile it twice, once with code
5796  * generation turned off and size counting turned on, and once "for real".
5797  * This also means that we don't allocate space until we are sure that the
5798  * thing really will compile successfully, and we never have to move the
5799  * code and thus invalidate pointers into it.  (Note that it has to be in
5800  * one piece because free() must be able to free it all.) [NB: not true in perl]
5801  *
5802  * Beware that the optimization-preparation code in here knows about some
5803  * of the structure of the compiled regexp.  [I'll say.]
5804  */
5805
5806 REGEXP *
5807 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5808                     OP *expr, const regexp_engine* eng, REGEXP *old_re,
5809                      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5810 {
5811     dVAR;
5812     REGEXP *rx;
5813     struct regexp *r;
5814     regexp_internal *ri;
5815     STRLEN plen;
5816     char *exp;
5817     regnode *scan;
5818     I32 flags;
5819     SSize_t minlen = 0;
5820     U32 rx_flags;
5821     SV *pat;
5822     SV *code_blocksv = NULL;
5823     SV** new_patternp = patternp;
5824
5825     /* these are all flags - maybe they should be turned
5826      * into a single int with different bit masks */
5827     I32 sawlookahead = 0;
5828     I32 sawplus = 0;
5829     I32 sawopen = 0;
5830     I32 sawminmod = 0;
5831
5832     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5833     bool recompile = 0;
5834     bool runtime_code = 0;
5835     scan_data_t data;
5836     RExC_state_t RExC_state;
5837     RExC_state_t * const pRExC_state = &RExC_state;
5838 #ifdef TRIE_STUDY_OPT
5839     int restudied = 0;
5840     RExC_state_t copyRExC_state;
5841 #endif
5842     GET_RE_DEBUG_FLAGS_DECL;
5843
5844     PERL_ARGS_ASSERT_RE_OP_COMPILE;
5845
5846     DEBUG_r(if (!PL_colorset) reginitcolors());
5847
5848 #ifndef PERL_IN_XSUB_RE
5849     /* Initialize these here instead of as-needed, as is quick and avoids
5850      * having to test them each time otherwise */
5851     if (! PL_AboveLatin1) {
5852         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5853         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5854         PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
5855
5856         PL_Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(ASCII_invlist);
5857         PL_L1Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(ASCII_invlist);
5858         PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(ASCII_invlist);
5859
5860         PL_L1Posix_ptrs[_CC_ALPHANUMERIC]
5861                                 = _new_invlist_C_array(L1PosixAlnum_invlist);
5862         PL_Posix_ptrs[_CC_ALPHANUMERIC]
5863                                 = _new_invlist_C_array(PosixAlnum_invlist);
5864
5865         PL_L1Posix_ptrs[_CC_ALPHA]
5866                                 = _new_invlist_C_array(L1PosixAlpha_invlist);
5867         PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PosixAlpha_invlist);
5868
5869         PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PosixBlank_invlist);
5870         PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(XPosixBlank_invlist);
5871
5872         /* Cased is the same as Alpha in the ASCII range */
5873         PL_L1Posix_ptrs[_CC_CASED] =  _new_invlist_C_array(L1Cased_invlist);
5874         PL_Posix_ptrs[_CC_CASED] =  _new_invlist_C_array(PosixAlpha_invlist);
5875
5876         PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PosixCntrl_invlist);
5877         PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(XPosixCntrl_invlist);
5878
5879         PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5880         PL_L1Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5881
5882         PL_L1Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(L1PosixGraph_invlist);
5883         PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PosixGraph_invlist);
5884
5885         PL_L1Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(L1PosixLower_invlist);
5886         PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PosixLower_invlist);
5887
5888         PL_L1Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(L1PosixPrint_invlist);
5889         PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PosixPrint_invlist);
5890
5891         PL_L1Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(L1PosixPunct_invlist);
5892         PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PosixPunct_invlist);
5893
5894         PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PerlSpace_invlist);
5895         PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(XPerlSpace_invlist);
5896         PL_Posix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(PosixSpace_invlist);
5897         PL_XPosix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(XPosixSpace_invlist);
5898
5899         PL_L1Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(L1PosixUpper_invlist);
5900         PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PosixUpper_invlist);
5901
5902         PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist);
5903
5904         PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PosixWord_invlist);
5905         PL_L1Posix_ptrs[_CC_WORDCHAR]
5906                                 = _new_invlist_C_array(L1PosixWord_invlist);
5907
5908         PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PosixXDigit_invlist);
5909         PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist);
5910
5911         PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
5912     }
5913 #endif
5914
5915     pRExC_state->code_blocks = NULL;
5916     pRExC_state->num_code_blocks = 0;
5917
5918     if (is_bare_re)
5919         *is_bare_re = FALSE;
5920
5921     if (expr && (expr->op_type == OP_LIST ||
5922                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5923         /* allocate code_blocks if needed */
5924         OP *o;
5925         int ncode = 0;
5926
5927         for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
5928             if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5929                 ncode++; /* count of DO blocks */
5930         if (ncode) {
5931             pRExC_state->num_code_blocks = ncode;
5932             Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5933         }
5934     }
5935
5936     if (!pat_count) {
5937         /* compile-time pattern with just OP_CONSTs and DO blocks */
5938
5939         int n;
5940         OP *o;
5941
5942         /* find how many CONSTs there are */
5943         assert(expr);
5944         n = 0;
5945         if (expr->op_type == OP_CONST)
5946             n = 1;
5947         else
5948             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5949                 if (o->op_type == OP_CONST)
5950                     n++;
5951             }
5952
5953         /* fake up an SV array */
5954
5955         assert(!new_patternp);
5956         Newx(new_patternp, n, SV*);
5957         SAVEFREEPV(new_patternp);
5958         pat_count = n;
5959
5960         n = 0;
5961         if (expr->op_type == OP_CONST)
5962             new_patternp[n] = cSVOPx_sv(expr);
5963         else
5964             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5965                 if (o->op_type == OP_CONST)
5966                     new_patternp[n++] = cSVOPo_sv;
5967             }
5968
5969     }
5970
5971     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5972         "Assembling pattern from %d elements%s\n", pat_count,
5973             orig_rx_flags & RXf_SPLIT ? " for split" : ""));
5974
5975     /* set expr to the first arg op */
5976
5977     if (pRExC_state->num_code_blocks
5978          && expr->op_type != OP_CONST)
5979     {
5980             expr = cLISTOPx(expr)->op_first;
5981             assert(   expr->op_type == OP_PUSHMARK
5982                    || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
5983                    || expr->op_type == OP_PADRANGE);
5984             expr = expr->op_sibling;
5985     }
5986
5987     pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
5988                         expr, &recompile, NULL);
5989
5990     /* handle bare (possibly after overloading) regex: foo =~ $re */
5991     {
5992         SV *re = pat;
5993         if (SvROK(re))
5994             re = SvRV(re);
5995         if (SvTYPE(re) == SVt_REGEXP) {
5996             if (is_bare_re)
5997                 *is_bare_re = TRUE;
5998             SvREFCNT_inc(re);
5999             Safefree(pRExC_state->code_blocks);
6000             DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6001                 "Precompiled pattern%s\n",
6002                     orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6003
6004             return (REGEXP*)re;
6005         }
6006     }
6007
6008     exp = SvPV_nomg(pat, plen);
6009
6010     if (!eng->op_comp) {
6011         if ((SvUTF8(pat) && IN_BYTES)
6012                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6013         {
6014             /* make a temporary copy; either to convert to bytes,
6015              * or to avoid repeating get-magic / overloaded stringify */
6016             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6017                                         (IN_BYTES ? 0 : SvUTF8(pat)));
6018         }
6019         Safefree(pRExC_state->code_blocks);
6020         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6021     }
6022
6023     /* ignore the utf8ness if the pattern is 0 length */
6024     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6025     RExC_uni_semantics = 0;
6026     RExC_contains_locale = 0;
6027     RExC_contains_i = 0;
6028     pRExC_state->runtime_code_qr = NULL;
6029
6030     DEBUG_COMPILE_r({
6031             SV *dsv= sv_newmortal();
6032             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6033             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6034                           PL_colors[4],PL_colors[5],s);
6035         });
6036
6037   redo_first_pass:
6038     /* we jump here if we upgrade the pattern to utf8 and have to
6039      * recompile */
6040
6041     if ((pm_flags & PMf_USE_RE_EVAL)
6042                 /* this second condition covers the non-regex literal case,
6043                  * i.e.  $foo =~ '(?{})'. */
6044                 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6045     )
6046         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6047
6048     /* return old regex if pattern hasn't changed */
6049     /* XXX: note in the below we have to check the flags as well as the pattern.
6050      *
6051      * Things get a touch tricky as we have to compare the utf8 flag independently
6052      * from the compile flags.
6053      */
6054
6055     if (   old_re
6056         && !recompile
6057         && !!RX_UTF8(old_re) == !!RExC_utf8
6058         && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6059         && RX_PRECOMP(old_re)
6060         && RX_PRELEN(old_re) == plen
6061         && memEQ(RX_PRECOMP(old_re), exp, plen)
6062         && !runtime_code /* with runtime code, always recompile */ )
6063     {
6064         Safefree(pRExC_state->code_blocks);
6065         return old_re;
6066     }
6067
6068     rx_flags = orig_rx_flags;
6069
6070     if (rx_flags & PMf_FOLD) {
6071         RExC_contains_i = 1;
6072     }
6073     if (initial_charset == REGEX_LOCALE_CHARSET) {
6074         RExC_contains_locale = 1;
6075     }
6076     else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6077
6078         /* Set to use unicode semantics if the pattern is in utf8 and has the
6079          * 'depends' charset specified, as it means unicode when utf8  */
6080         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6081     }
6082
6083     RExC_precomp = exp;
6084     RExC_flags = rx_flags;
6085     RExC_pm_flags = pm_flags;
6086
6087     if (runtime_code) {
6088         if (TAINTING_get && TAINT_get)
6089             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6090
6091         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6092             /* whoops, we have a non-utf8 pattern, whilst run-time code
6093              * got compiled as utf8. Try again with a utf8 pattern */
6094             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6095                                     pRExC_state->num_code_blocks);
6096             goto redo_first_pass;
6097         }
6098     }
6099     assert(!pRExC_state->runtime_code_qr);
6100
6101     RExC_sawback = 0;
6102
6103     RExC_seen = 0;
6104     RExC_in_lookbehind = 0;
6105     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6106     RExC_extralen = 0;
6107     RExC_override_recoding = 0;
6108     RExC_in_multi_char_class = 0;
6109
6110     /* First pass: determine size, legality. */
6111     RExC_parse = exp;
6112     RExC_start = exp;
6113     RExC_end = exp + plen;
6114     RExC_naughty = 0;
6115     RExC_npar = 1;
6116     RExC_nestroot = 0;
6117     RExC_size = 0L;
6118     RExC_emit = (regnode *) &RExC_emit_dummy;
6119     RExC_whilem_seen = 0;
6120     RExC_open_parens = NULL;
6121     RExC_close_parens = NULL;
6122     RExC_opend = NULL;
6123     RExC_paren_names = NULL;
6124 #ifdef DEBUGGING
6125     RExC_paren_name_list = NULL;
6126 #endif
6127     RExC_recurse = NULL;
6128     RExC_recurse_count = 0;
6129     pRExC_state->code_index = 0;
6130
6131 #if 0 /* REGC() is (currently) a NOP at the first pass.
6132        * Clever compilers notice this and complain. --jhi */
6133     REGC((U8)REG_MAGIC, (char*)RExC_emit);
6134 #endif
6135     DEBUG_PARSE_r(
6136         PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6137         RExC_lastnum=0;
6138         RExC_lastparse=NULL;
6139     );
6140     /* reg may croak on us, not giving us a chance to free
6141        pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
6142        need it to survive as long as the regexp (qr/(?{})/).
6143        We must check that code_blocksv is not already set, because we may
6144        have jumped back to restart the sizing pass. */
6145     if (pRExC_state->code_blocks && !code_blocksv) {
6146         code_blocksv = newSV_type(SVt_PV);
6147         SAVEFREESV(code_blocksv);
6148         SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6149         SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6150     }
6151     if (reg(pRExC_state, 0, &flags,1) == NULL) {
6152         /* It's possible to write a regexp in ascii that represents Unicode
6153         codepoints outside of the byte range, such as via \x{100}. If we
6154         detect such a sequence we have to convert the entire pattern to utf8
6155         and then recompile, as our sizing calculation will have been based
6156         on 1 byte == 1 character, but we will need to use utf8 to encode
6157         at least some part of the pattern, and therefore must convert the whole
6158         thing.
6159         -- dmq */
6160         if (flags & RESTART_UTF8) {
6161             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6162                                     pRExC_state->num_code_blocks);
6163             goto redo_first_pass;
6164         }
6165         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6166     }
6167     if (code_blocksv)
6168         SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6169
6170     DEBUG_PARSE_r({
6171         PerlIO_printf(Perl_debug_log,
6172             "Required size %"IVdf" nodes\n"
6173             "Starting second pass (creation)\n",
6174             (IV)RExC_size);
6175         RExC_lastnum=0;
6176         RExC_lastparse=NULL;
6177     });
6178
6179     /* The first pass could have found things that force Unicode semantics */
6180     if ((RExC_utf8 || RExC_uni_semantics)
6181          && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6182     {
6183         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6184     }
6185
6186     /* Small enough for pointer-storage convention?
6187        If extralen==0, this means that we will not need long jumps. */
6188     if (RExC_size >= 0x10000L && RExC_extralen)
6189         RExC_size += RExC_extralen;
6190     else
6191         RExC_extralen = 0;
6192     if (RExC_whilem_seen > 15)
6193         RExC_whilem_seen = 15;
6194
6195     /* Allocate space and zero-initialize. Note, the two step process
6196        of zeroing when in debug mode, thus anything assigned has to
6197        happen after that */
6198     rx = (REGEXP*) newSV_type(SVt_REGEXP);
6199     r = ReANY(rx);
6200     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6201          char, regexp_internal);
6202     if ( r == NULL || ri == NULL )
6203         FAIL("Regexp out of space");
6204 #ifdef DEBUGGING
6205     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6206     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
6207 #else
6208     /* bulk initialize base fields with 0. */
6209     Zero(ri, sizeof(regexp_internal), char);
6210 #endif
6211
6212     /* non-zero initialization begins here */
6213     RXi_SET( r, ri );
6214     r->engine= eng;
6215     r->extflags = rx_flags;
6216     RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6217
6218     if (pm_flags & PMf_IS_QR) {
6219         ri->code_blocks = pRExC_state->code_blocks;
6220         ri->num_code_blocks = pRExC_state->num_code_blocks;
6221     }
6222     else
6223     {
6224         int n;
6225         for (n = 0; n < pRExC_state->num_code_blocks; n++)
6226             if (pRExC_state->code_blocks[n].src_regex)
6227                 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6228         SAVEFREEPV(pRExC_state->code_blocks);
6229     }
6230
6231     {
6232         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6233         bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
6234
6235         /* The caret is output if there are any defaults: if not all the STD
6236          * flags are set, or if no character set specifier is needed */
6237         bool has_default =
6238                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6239                     || ! has_charset);
6240         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
6241         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6242                             >> RXf_PMf_STD_PMMOD_SHIFT);
6243         const char *fptr = STD_PAT_MODS;        /*"msix"*/
6244         char *p;
6245         /* Allocate for the worst case, which is all the std flags are turned
6246          * on.  If more precision is desired, we could do a population count of
6247          * the flags set.  This could be done with a small lookup table, or by
6248          * shifting, masking and adding, or even, when available, assembly
6249          * language for a machine-language population count.
6250          * We never output a minus, as all those are defaults, so are
6251          * covered by the caret */
6252         const STRLEN wraplen = plen + has_p + has_runon
6253             + has_default       /* If needs a caret */
6254
6255                 /* If needs a character set specifier */
6256             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6257             + (sizeof(STD_PAT_MODS) - 1)
6258             + (sizeof("(?:)") - 1);
6259
6260         Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6261         r->xpv_len_u.xpvlenu_pv = p;
6262         if (RExC_utf8)
6263             SvFLAGS(rx) |= SVf_UTF8;
6264         *p++='('; *p++='?';
6265
6266         /* If a default, cover it using the caret */
6267         if (has_default) {
6268             *p++= DEFAULT_PAT_MOD;
6269         }
6270         if (has_charset) {
6271             STRLEN len;
6272             const char* const name = get_regex_charset_name(r->extflags, &len);
6273             Copy(name, p, len, char);
6274             p += len;
6275         }
6276         if (has_p)
6277             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6278         {
6279             char ch;
6280             while((ch = *fptr++)) {
6281                 if(reganch & 1)
6282                     *p++ = ch;
6283                 reganch >>= 1;
6284             }
6285         }
6286
6287         *p++ = ':';
6288         Copy(RExC_precomp, p, plen, char);
6289         assert ((RX_WRAPPED(rx) - p) < 16);
6290         r->pre_prefix = p - RX_WRAPPED(rx);
6291         p += plen;
6292         if (has_runon)
6293             *p++ = '\n';
6294         *p++ = ')';
6295         *p = 0;
6296         SvCUR_set(rx, p - RX_WRAPPED(rx));
6297     }
6298
6299     r->intflags = 0;
6300     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6301
6302     if (RExC_seen & REG_SEEN_RECURSE) {
6303         Newxz(RExC_open_parens, RExC_npar,regnode *);
6304         SAVEFREEPV(RExC_open_parens);
6305         Newxz(RExC_close_parens,RExC_npar,regnode *);
6306         SAVEFREEPV(RExC_close_parens);
6307     }
6308
6309     /* Useful during FAIL. */
6310 #ifdef RE_TRACK_PATTERN_OFFSETS
6311     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6312     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6313                           "%s %"UVuf" bytes for offset annotations.\n",
6314                           ri->u.offsets ? "Got" : "Couldn't get",
6315                           (UV)((2*RExC_size+1) * sizeof(U32))));
6316 #endif
6317     SetProgLen(ri,RExC_size);
6318     RExC_rx_sv = rx;
6319     RExC_rx = r;
6320     RExC_rxi = ri;
6321
6322     /* Second pass: emit code. */
6323     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
6324     RExC_pm_flags = pm_flags;
6325     RExC_parse = exp;
6326     RExC_end = exp + plen;
6327     RExC_naughty = 0;
6328     RExC_npar = 1;
6329     RExC_emit_start = ri->program;
6330     RExC_emit = ri->program;
6331     RExC_emit_bound = ri->program + RExC_size + 1;
6332     pRExC_state->code_index = 0;
6333
6334     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6335     if (reg(pRExC_state, 0, &flags,1) == NULL) {
6336         ReREFCNT_dec(rx);
6337         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6338     }
6339     /* XXXX To minimize changes to RE engine we always allocate
6340        3-units-long substrs field. */
6341     Newx(r->substrs, 1, struct reg_substr_data);
6342     if (RExC_recurse_count) {
6343         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6344         SAVEFREEPV(RExC_recurse);
6345     }
6346
6347 reStudy:
6348     r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6349     Zero(r->substrs, 1, struct reg_substr_data);
6350
6351 #ifdef TRIE_STUDY_OPT
6352     if (!restudied) {
6353         StructCopy(&zero_scan_data, &data, scan_data_t);
6354         copyRExC_state = RExC_state;
6355     } else {
6356         U32 seen=RExC_seen;
6357         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6358
6359         RExC_state = copyRExC_state;
6360         if (seen & REG_TOP_LEVEL_BRANCHES)
6361             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6362         else
6363             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
6364         StructCopy(&zero_scan_data, &data, scan_data_t);
6365     }
6366 #else
6367     StructCopy(&zero_scan_data, &data, scan_data_t);
6368 #endif
6369
6370     /* Dig out information for optimizations. */
6371     r->extflags = RExC_flags; /* was pm_op */
6372     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6373
6374     if (UTF)
6375         SvUTF8_on(rx);  /* Unicode in it? */
6376     ri->regstclass = NULL;
6377     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
6378         r->intflags |= PREGf_NAUGHTY;
6379     scan = ri->program + 1;             /* First BRANCH. */
6380
6381     /* testing for BRANCH here tells us whether there is "must appear"
6382        data in the pattern. If there is then we can use it for optimisations */
6383     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
6384         SSize_t fake;
6385         STRLEN longest_float_length, longest_fixed_length;
6386         regnode_ssc ch_class; /* pointed to by data */
6387         int stclass_flag;
6388         SSize_t last_close = 0; /* pointed to by data */
6389         regnode *first= scan;
6390         regnode *first_next= regnext(first);
6391         /*
6392          * Skip introductions and multiplicators >= 1
6393          * so that we can extract the 'meat' of the pattern that must
6394          * match in the large if() sequence following.
6395          * NOTE that EXACT is NOT covered here, as it is normally
6396          * picked up by the optimiser separately.
6397          *
6398          * This is unfortunate as the optimiser isnt handling lookahead
6399          * properly currently.
6400          *
6401          */
6402         while ((OP(first) == OPEN && (sawopen = 1)) ||
6403                /* An OR of *one* alternative - should not happen now. */
6404             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6405             /* for now we can't handle lookbehind IFMATCH*/
6406             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6407             (OP(first) == PLUS) ||
6408             (OP(first) == MINMOD) ||
6409                /* An {n,m} with n>0 */
6410             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6411             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6412         {
6413                 /*
6414                  * the only op that could be a regnode is PLUS, all the rest
6415                  * will be regnode_1 or regnode_2.
6416                  *
6417                  * (yves doesn't think this is true)
6418                  */
6419                 if (OP(first) == PLUS)
6420                     sawplus = 1;
6421                 else {
6422                     if (OP(first) == MINMOD)
6423                         sawminmod = 1;
6424                     first += regarglen[OP(first)];
6425                 }
6426                 first = NEXTOPER(first);
6427                 first_next= regnext(first);
6428         }
6429
6430         /* Starting-point info. */
6431       again:
6432         DEBUG_PEEP("first:",first,0);
6433         /* Ignore EXACT as we deal with it later. */
6434         if (PL_regkind[OP(first)] == EXACT) {
6435             if (OP(first) == EXACT)
6436                 NOOP;   /* Empty, get anchored substr later. */
6437             else
6438                 ri->regstclass = first;
6439         }
6440 #ifdef TRIE_STCLASS
6441         else if (PL_regkind[OP(first)] == TRIE &&
6442                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6443         {
6444             regnode *trie_op;
6445             /* this can happen only on restudy */
6446             if ( OP(first) == TRIE ) {
6447                 struct regnode_1 *trieop = (struct regnode_1 *)
6448                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
6449                 StructCopy(first,trieop,struct regnode_1);
6450                 trie_op=(regnode *)trieop;
6451             } else {
6452                 struct regnode_charclass *trieop = (struct regnode_charclass *)
6453                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6454                 StructCopy(first,trieop,struct regnode_charclass);
6455                 trie_op=(regnode *)trieop;
6456             }
6457             OP(trie_op)+=2;
6458             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6459             ri->regstclass = trie_op;
6460         }
6461 #endif
6462         else if (REGNODE_SIMPLE(OP(first)))
6463             ri->regstclass = first;
6464         else if (PL_regkind[OP(first)] == BOUND ||
6465                  PL_regkind[OP(first)] == NBOUND)
6466             ri->regstclass = first;
6467         else if (PL_regkind[OP(first)] == BOL) {
6468             r->extflags |= (OP(first) == MBOL
6469                            ? RXf_ANCH_MBOL
6470                            : (OP(first) == SBOL
6471                               ? RXf_ANCH_SBOL
6472                               : RXf_ANCH_BOL));
6473             first = NEXTOPER(first);
6474             goto again;
6475         }
6476         else if (OP(first) == GPOS) {
6477             r->extflags |= RXf_ANCH_GPOS;
6478             first = NEXTOPER(first);
6479             goto again;
6480         }
6481         else if ((!sawopen || !RExC_sawback) &&
6482             (OP(first) == STAR &&
6483             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6484             !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6485         {
6486             /* turn .* into ^.* with an implied $*=1 */
6487             const int type =
6488                 (OP(NEXTOPER(first)) == REG_ANY)
6489                     ? RXf_ANCH_MBOL
6490                     : RXf_ANCH_SBOL;
6491             r->extflags |= type;
6492             r->intflags |= PREGf_IMPLICIT;
6493             first = NEXTOPER(first);
6494             goto again;
6495         }
6496         if (sawplus && !sawminmod && !sawlookahead && (!sawopen || !RExC_sawback)
6497             && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6498             /* x+ must match at the 1st pos of run of x's */
6499             r->intflags |= PREGf_SKIP;
6500
6501         /* Scan is after the zeroth branch, first is atomic matcher. */
6502 #ifdef TRIE_STUDY_OPT
6503         DEBUG_PARSE_r(
6504             if (!restudied)
6505                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6506                               (IV)(first - scan + 1))
6507         );
6508 #else
6509         DEBUG_PARSE_r(
6510             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6511                 (IV)(first - scan + 1))
6512         );
6513 #endif
6514
6515
6516         /*
6517         * If there's something expensive in the r.e., find the
6518         * longest literal string that must appear and make it the
6519         * regmust.  Resolve ties in favor of later strings, since
6520         * the regstart check works with the beginning of the r.e.
6521         * and avoiding duplication strengthens checking.  Not a
6522         * strong reason, but sufficient in the absence of others.
6523         * [Now we resolve ties in favor of the earlier string if
6524         * it happens that c_offset_min has been invalidated, since the
6525         * earlier string may buy us something the later one won't.]
6526         */
6527
6528         data.longest_fixed = newSVpvs("");
6529         data.longest_float = newSVpvs("");
6530         data.last_found = newSVpvs("");
6531         data.longest = &(data.longest_fixed);
6532         ENTER_with_name("study_chunk");
6533         SAVEFREESV(data.longest_fixed);
6534         SAVEFREESV(data.longest_float);
6535         SAVEFREESV(data.last_found);
6536         first = scan;
6537         if (!ri->regstclass) {
6538             ssc_init(pRExC_state, &ch_class);
6539             data.start_class = &ch_class;
6540             stclass_flag = SCF_DO_STCLASS_AND;
6541         } else                          /* XXXX Check for BOUND? */
6542             stclass_flag = 0;
6543         data.last_closep = &last_close;
6544
6545         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6546             &data, -1, NULL, NULL,
6547             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
6548                           | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6549             0);
6550
6551
6552         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6553
6554
6555         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6556              && data.last_start_min == 0 && data.last_end > 0
6557              && !RExC_seen_zerolen
6558              && !(RExC_seen & REG_SEEN_VERBARG)
6559              && !((RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6560             r->extflags |= RXf_CHECK_ALL;
6561         scan_commit(pRExC_state, &data,&minlen,0);
6562
6563         longest_float_length = CHR_SVLEN(data.longest_float);
6564
6565         if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
6566                    && data.offset_fixed == data.offset_float_min
6567                    && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6568             && S_setup_longest (aTHX_ pRExC_state,
6569                                     data.longest_float,
6570                                     &(r->float_utf8),
6571                                     &(r->float_substr),
6572                                     &(r->float_end_shift),
6573                                     data.lookbehind_float,
6574                                     data.offset_float_min,
6575                                     data.minlen_float,
6576                                     longest_float_length,
6577                                     cBOOL(data.flags & SF_FL_BEFORE_EOL),
6578                                     cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6579         {
6580             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6581             r->float_max_offset = data.offset_float_max;
6582             if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
6583                 r->float_max_offset -= data.lookbehind_float;
6584             SvREFCNT_inc_simple_void_NN(data.longest_float);
6585         }
6586         else {
6587             r->float_substr = r->float_utf8 = NULL;
6588             longest_float_length = 0;
6589         }
6590
6591         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6592
6593         if (S_setup_longest (aTHX_ pRExC_state,
6594                                 data.longest_fixed,
6595                                 &(r->anchored_utf8),
6596                                 &(r->anchored_substr),
6597                                 &(r->anchored_end_shift),
6598                                 data.lookbehind_fixed,
6599                                 data.offset_fixed,
6600                                 data.minlen_fixed,
6601                                 longest_fixed_length,
6602                                 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6603                                 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6604         {
6605             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6606             SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6607         }
6608         else {
6609             r->anchored_substr = r->anchored_utf8 = NULL;
6610             longest_fixed_length = 0;
6611         }
6612         LEAVE_with_name("study_chunk");
6613
6614         if (ri->regstclass
6615             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6616             ri->regstclass = NULL;
6617
6618         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6619             && stclass_flag
6620             && ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
6621             && !ssc_is_anything(data.start_class))
6622         {
6623             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
6624
6625             ssc_finalize(pRExC_state, data.start_class);
6626
6627             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
6628             StructCopy(data.start_class,
6629                        (regnode_ssc*)RExC_rxi->data->data[n],
6630                        regnode_ssc);
6631             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6632             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6633             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6634                       regprop(r, sv, (regnode*)data.start_class);
6635                       PerlIO_printf(Perl_debug_log,
6636                                     "synthetic stclass \"%s\".\n",
6637                                     SvPVX_const(sv));});
6638             data.start_class = NULL;
6639         }
6640
6641         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6642         if (longest_fixed_length > longest_float_length) {
6643             r->check_end_shift = r->anchored_end_shift;
6644             r->check_substr = r->anchored_substr;
6645             r->check_utf8 = r->anchored_utf8;
6646             r->check_offset_min = r->check_offset_max = r->anchored_offset;
6647             if (r->extflags & RXf_ANCH_SINGLE)
6648                 r->extflags |= RXf_NOSCAN;
6649         }
6650         else {
6651             r->check_end_shift = r->float_end_shift;
6652             r->check_substr = r->float_substr;
6653             r->check_utf8 = r->float_utf8;
6654             r->check_offset_min = r->float_min_offset;
6655             r->check_offset_max = r->float_max_offset;
6656         }
6657         if ((r->check_substr || r->check_utf8) ) {
6658             r->extflags |= RXf_USE_INTUIT;
6659             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6660                 r->extflags |= RXf_INTUIT_TAIL;
6661         }
6662         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6663         if ( (STRLEN)minlen < longest_float_length )
6664             minlen= longest_float_length;
6665         if ( (STRLEN)minlen < longest_fixed_length )
6666             minlen= longest_fixed_length;
6667         */
6668     }
6669     else {
6670         /* Several toplevels. Best we can is to set minlen. */
6671         SSize_t fake;
6672         regnode_ssc ch_class;
6673         SSize_t last_close = 0;
6674
6675         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6676
6677         scan = ri->program + 1;
6678         ssc_init(pRExC_state, &ch_class);
6679         data.start_class = &ch_class;
6680         data.last_closep = &last_close;
6681
6682
6683         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6684             &data, -1, NULL, NULL,
6685             SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS
6686                               |(restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6687             0);
6688
6689         CHECK_RESTUDY_GOTO_butfirst(NOOP);
6690
6691         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6692                 = r->float_substr = r->float_utf8 = NULL;
6693
6694         if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
6695             && ! ssc_is_anything(data.start_class))
6696         {
6697             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
6698
6699             ssc_finalize(pRExC_state, data.start_class);
6700
6701             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
6702             StructCopy(data.start_class,
6703                        (regnode_ssc*)RExC_rxi->data->data[n],
6704                        regnode_ssc);
6705             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6706             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6707             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6708                       regprop(r, sv, (regnode*)data.start_class);
6709                       PerlIO_printf(Perl_debug_log,
6710                                     "synthetic stclass \"%s\".\n",
6711                                     SvPVX_const(sv));});
6712             data.start_class = NULL;
6713         }
6714     }
6715
6716     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6717        the "real" pattern. */
6718     DEBUG_OPTIMISE_r({
6719         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6720                       (IV)minlen, (IV)r->minlen);
6721     });
6722     r->minlenret = minlen;
6723     if (r->minlen < minlen)
6724         r->minlen = minlen;
6725
6726     if (RExC_seen & REG_SEEN_GPOS)
6727         r->extflags |= RXf_GPOS_SEEN;
6728     if (RExC_seen & REG_SEEN_LOOKBEHIND)
6729         r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the lookbehind */
6730     if (pRExC_state->num_code_blocks)
6731         r->extflags |= RXf_EVAL_SEEN;
6732     if (RExC_seen & REG_SEEN_CANY)
6733         r->extflags |= RXf_CANY_SEEN;
6734     if (RExC_seen & REG_SEEN_VERBARG)
6735     {
6736         r->intflags |= PREGf_VERBARG_SEEN;
6737         r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
6738     }
6739     if (RExC_seen & REG_SEEN_CUTGROUP)
6740         r->intflags |= PREGf_CUTGROUP_SEEN;
6741     if (pm_flags & PMf_USE_RE_EVAL)
6742         r->intflags |= PREGf_USE_RE_EVAL;
6743     if (RExC_paren_names)
6744         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6745     else
6746         RXp_PAREN_NAMES(r) = NULL;
6747
6748     {
6749         regnode *first = ri->program + 1;
6750         U8 fop = OP(first);
6751         regnode *next = NEXTOPER(first);
6752         U8 nop = OP(next);
6753
6754         if (PL_regkind[fop] == NOTHING && nop == END)
6755             r->extflags |= RXf_NULL;
6756         else if (PL_regkind[fop] == BOL && nop == END)
6757             r->extflags |= RXf_START_ONLY;
6758         else if (fop == PLUS && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE && OP(regnext(first)) == END)
6759             r->extflags |= RXf_WHITE;
6760         else if ( r->extflags & RXf_SPLIT && fop == EXACT && STR_LEN(first) == 1 && *(STRING(first)) == ' ' && OP(regnext(first)) == END )
6761             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6762
6763     }
6764 #ifdef DEBUGGING
6765     if (RExC_paren_names) {
6766         ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
6767         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6768     } else
6769 #endif
6770         ri->name_list_idx = 0;
6771
6772     if (RExC_recurse_count) {
6773         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6774             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6775             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6776         }
6777     }
6778     Newxz(r->offs, RExC_npar, regexp_paren_pair);
6779     /* assume we don't need to swap parens around before we match */
6780
6781     DEBUG_DUMP_r({
6782         PerlIO_printf(Perl_debug_log,"Final program:\n");
6783         regdump(r);
6784     });
6785 #ifdef RE_TRACK_PATTERN_OFFSETS
6786     DEBUG_OFFSETS_r(if (ri->u.offsets) {
6787         const STRLEN len = ri->u.offsets[0];
6788         STRLEN i;
6789         GET_RE_DEBUG_FLAGS_DECL;
6790         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6791         for (i = 1; i <= len; i++) {
6792             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6793                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6794                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6795             }
6796         PerlIO_printf(Perl_debug_log, "\n");
6797     });
6798 #endif
6799
6800 #ifdef USE_ITHREADS
6801     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
6802      * by setting the regexp SV to readonly-only instead. If the
6803      * pattern's been recompiled, the USEDness should remain. */
6804     if (old_re && SvREADONLY(old_re))
6805         SvREADONLY_on(rx);
6806 #endif
6807     return rx;
6808 }
6809
6810
6811 SV*
6812 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6813                     const U32 flags)
6814 {
6815     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6816
6817     PERL_UNUSED_ARG(value);
6818
6819     if (flags & RXapif_FETCH) {
6820         return reg_named_buff_fetch(rx, key, flags);
6821     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6822         Perl_croak_no_modify();
6823         return NULL;
6824     } else if (flags & RXapif_EXISTS) {
6825         return reg_named_buff_exists(rx, key, flags)
6826             ? &PL_sv_yes
6827             : &PL_sv_no;
6828     } else if (flags & RXapif_REGNAMES) {
6829         return reg_named_buff_all(rx, flags);
6830     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6831         return reg_named_buff_scalar(rx, flags);
6832     } else {
6833         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6834         return NULL;
6835     }
6836 }
6837
6838 SV*
6839 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6840                          const U32 flags)
6841 {
6842     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6843     PERL_UNUSED_ARG(lastkey);
6844
6845     if (flags & RXapif_FIRSTKEY)
6846         return reg_named_buff_firstkey(rx, flags);
6847     else if (flags & RXapif_NEXTKEY)
6848         return reg_named_buff_nextkey(rx, flags);
6849     else {
6850         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6851         return NULL;
6852     }
6853 }
6854
6855 SV*
6856 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6857                           const U32 flags)
6858 {
6859     AV *retarray = NULL;
6860     SV *ret;
6861     struct regexp *const rx = ReANY(r);
6862
6863     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6864
6865     if (flags & RXapif_ALL)
6866         retarray=newAV();
6867
6868     if (rx && RXp_PAREN_NAMES(rx)) {
6869         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6870         if (he_str) {
6871             IV i;
6872             SV* sv_dat=HeVAL(he_str);
6873             I32 *nums=(I32*)SvPVX(sv_dat);
6874             for ( i=0; i<SvIVX(sv_dat); i++ ) {
6875                 if ((I32)(rx->nparens) >= nums[i]
6876                     && rx->offs[nums[i]].start != -1
6877                     && rx->offs[nums[i]].end != -1)
6878                 {
6879                     ret = newSVpvs("");
6880                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6881                     if (!retarray)
6882                         return ret;
6883                 } else {
6884                     if (retarray)
6885                         ret = newSVsv(&PL_sv_undef);
6886                 }
6887                 if (retarray)
6888                     av_push(retarray, ret);
6889             }
6890             if (retarray)
6891                 return newRV_noinc(MUTABLE_SV(retarray));
6892         }
6893     }
6894     return NULL;
6895 }
6896
6897 bool
6898 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6899                            const U32 flags)
6900 {
6901     struct regexp *const rx = ReANY(r);
6902
6903     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6904
6905     if (rx && RXp_PAREN_NAMES(rx)) {
6906         if (flags & RXapif_ALL) {
6907             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6908         } else {
6909             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6910             if (sv) {
6911                 SvREFCNT_dec_NN(sv);
6912                 return TRUE;
6913             } else {
6914                 return FALSE;
6915             }
6916         }
6917     } else {
6918         return FALSE;
6919     }
6920 }
6921
6922 SV*
6923 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6924 {
6925     struct regexp *const rx = ReANY(r);
6926
6927     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6928
6929     if ( rx && RXp_PAREN_NAMES(rx) ) {
6930         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6931
6932         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6933     } else {
6934         return FALSE;
6935     }
6936 }
6937
6938 SV*
6939 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6940 {
6941     struct regexp *const rx = ReANY(r);
6942     GET_RE_DEBUG_FLAGS_DECL;
6943
6944     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6945
6946     if (rx && RXp_PAREN_NAMES(rx)) {
6947         HV *hv = RXp_PAREN_NAMES(rx);
6948         HE *temphe;
6949         while ( (temphe = hv_iternext_flags(hv,0)) ) {
6950             IV i;
6951             IV parno = 0;
6952             SV* sv_dat = HeVAL(temphe);
6953             I32 *nums = (I32*)SvPVX(sv_dat);
6954             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6955                 if ((I32)(rx->lastparen) >= nums[i] &&
6956                     rx->offs[nums[i]].start != -1 &&
6957                     rx->offs[nums[i]].end != -1)
6958                 {
6959                     parno = nums[i];
6960                     break;
6961                 }
6962             }
6963             if (parno || flags & RXapif_ALL) {
6964                 return newSVhek(HeKEY_hek(temphe));
6965             }
6966         }
6967     }
6968     return NULL;
6969 }
6970
6971 SV*
6972 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6973 {
6974     SV *ret;
6975     AV *av;
6976     SSize_t length;
6977     struct regexp *const rx = ReANY(r);
6978
6979     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6980
6981     if (rx && RXp_PAREN_NAMES(rx)) {
6982         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6983             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6984         } else if (flags & RXapif_ONE) {
6985             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6986             av = MUTABLE_AV(SvRV(ret));
6987             length = av_len(av);
6988             SvREFCNT_dec_NN(ret);
6989             return newSViv(length + 1);
6990         } else {
6991             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6992             return NULL;
6993         }
6994     }
6995     return &PL_sv_undef;
6996 }
6997
6998 SV*
6999 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7000 {
7001     struct regexp *const rx = ReANY(r);
7002     AV *av = newAV();
7003
7004     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7005
7006     if (rx && RXp_PAREN_NAMES(rx)) {
7007         HV *hv= RXp_PAREN_NAMES(rx);
7008         HE *temphe;
7009         (void)hv_iterinit(hv);
7010         while ( (temphe = hv_iternext_flags(hv,0)) ) {
7011             IV i;
7012             IV parno = 0;
7013             SV* sv_dat = HeVAL(temphe);
7014             I32 *nums = (I32*)SvPVX(sv_dat);
7015             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7016                 if ((I32)(rx->lastparen) >= nums[i] &&
7017                     rx->offs[nums[i]].start != -1 &&
7018                     rx->offs[nums[i]].end != -1)
7019                 {
7020                     parno = nums[i];
7021                     break;
7022                 }
7023             }
7024             if (parno || flags & RXapif_ALL) {
7025                 av_push(av, newSVhek(HeKEY_hek(temphe)));
7026             }
7027         }
7028     }
7029
7030     return newRV_noinc(MUTABLE_SV(av));
7031 }
7032
7033 void
7034 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7035                              SV * const sv)
7036 {
7037     struct regexp *const rx = ReANY(r);
7038     char *s = NULL;
7039     SSize_t i = 0;
7040     SSize_t s1, t1;
7041     I32 n = paren;
7042
7043     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7044
7045     if (      n == RX_BUFF_IDX_CARET_PREMATCH
7046            || n == RX_BUFF_IDX_CARET_FULLMATCH
7047            || n == RX_BUFF_IDX_CARET_POSTMATCH
7048        )
7049     {
7050         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7051         if (!keepcopy) {
7052             /* on something like
7053              *    $r = qr/.../;
7054              *    /$qr/p;
7055              * the KEEPCOPY is set on the PMOP rather than the regex */
7056             if (PL_curpm && r == PM_GETRE(PL_curpm))
7057                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7058         }
7059         if (!keepcopy)
7060             goto ret_undef;
7061     }
7062
7063     if (!rx->subbeg)
7064         goto ret_undef;
7065
7066     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7067         /* no need to distinguish between them any more */
7068         n = RX_BUFF_IDX_FULLMATCH;
7069
7070     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7071         && rx->offs[0].start != -1)
7072     {
7073         /* $`, ${^PREMATCH} */
7074         i = rx->offs[0].start;
7075         s = rx->subbeg;
7076     }
7077     else
7078     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7079         && rx->offs[0].end != -1)
7080     {
7081         /* $', ${^POSTMATCH} */
7082         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7083         i = rx->sublen + rx->suboffset - rx->offs[0].end;
7084     }
7085     else
7086     if ( 0 <= n && n <= (I32)rx->nparens &&
7087         (s1 = rx->offs[n].start) != -1 &&
7088         (t1 = rx->offs[n].end) != -1)
7089     {
7090         /* $&, ${^MATCH},  $1 ... */
7091         i = t1 - s1;
7092         s = rx->subbeg + s1 - rx->suboffset;
7093     } else {
7094         goto ret_undef;
7095     }
7096
7097     assert(s >= rx->subbeg);
7098     assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7099     if (i >= 0) {
7100 #if NO_TAINT_SUPPORT
7101         sv_setpvn(sv, s, i);
7102 #else
7103         const int oldtainted = TAINT_get;
7104         TAINT_NOT;
7105         sv_setpvn(sv, s, i);
7106         TAINT_set(oldtainted);
7107 #endif
7108         if ( (rx->extflags & RXf_CANY_SEEN)
7109             ? (RXp_MATCH_UTF8(rx)
7110                         && (!i || is_utf8_string((U8*)s, i)))
7111             : (RXp_MATCH_UTF8(rx)) )
7112         {
7113             SvUTF8_on(sv);
7114         }
7115         else
7116             SvUTF8_off(sv);
7117         if (TAINTING_get) {
7118             if (RXp_MATCH_TAINTED(rx)) {
7119                 if (SvTYPE(sv) >= SVt_PVMG) {
7120                     MAGIC* const mg = SvMAGIC(sv);
7121                     MAGIC* mgt;
7122                     TAINT;
7123                     SvMAGIC_set(sv, mg->mg_moremagic);
7124                     SvTAINT(sv);
7125                     if ((mgt = SvMAGIC(sv))) {
7126                         mg->mg_moremagic = mgt;
7127                         SvMAGIC_set(sv, mg);
7128                     }
7129                 } else {
7130                     TAINT;
7131                     SvTAINT(sv);
7132                 }
7133             } else
7134                 SvTAINTED_off(sv);
7135         }
7136     } else {
7137       ret_undef:
7138         sv_setsv(sv,&PL_sv_undef);
7139         return;
7140     }
7141 }
7142
7143 void
7144 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7145                                                          SV const * const value)
7146 {
7147     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7148
7149     PERL_UNUSED_ARG(rx);
7150     PERL_UNUSED_ARG(paren);
7151     PERL_UNUSED_ARG(value);
7152
7153     if (!PL_localizing)
7154         Perl_croak_no_modify();
7155 }
7156
7157 I32
7158 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7159                               const I32 paren)
7160 {
7161     struct regexp *const rx = ReANY(r);
7162     I32 i;
7163     I32 s1, t1;
7164
7165     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7166
7167     if (   paren == RX_BUFF_IDX_CARET_PREMATCH
7168         || paren == RX_BUFF_IDX_CARET_FULLMATCH
7169         || paren == RX_BUFF_IDX_CARET_POSTMATCH
7170     )
7171     {
7172         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7173         if (!keepcopy) {
7174             /* on something like
7175              *    $r = qr/.../;
7176              *    /$qr/p;
7177              * the KEEPCOPY is set on the PMOP rather than the regex */
7178             if (PL_curpm && r == PM_GETRE(PL_curpm))
7179                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7180         }
7181         if (!keepcopy)
7182             goto warn_undef;
7183     }
7184
7185     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7186     switch (paren) {
7187       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7188       case RX_BUFF_IDX_PREMATCH:       /* $` */
7189         if (rx->offs[0].start != -1) {
7190                         i = rx->offs[0].start;
7191                         if (i > 0) {
7192                                 s1 = 0;
7193                                 t1 = i;
7194                                 goto getlen;
7195                         }
7196             }
7197         return 0;
7198
7199       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7200       case RX_BUFF_IDX_POSTMATCH:       /* $' */
7201             if (rx->offs[0].end != -1) {
7202                         i = rx->sublen - rx->offs[0].end;
7203                         if (i > 0) {
7204                                 s1 = rx->offs[0].end;
7205                                 t1 = rx->sublen;
7206                                 goto getlen;
7207                         }
7208             }
7209         return 0;
7210
7211       default: /* $& / ${^MATCH}, $1, $2, ... */
7212             if (paren <= (I32)rx->nparens &&
7213             (s1 = rx->offs[paren].start) != -1 &&
7214             (t1 = rx->offs[paren].end) != -1)
7215             {
7216             i = t1 - s1;
7217             goto getlen;
7218         } else {
7219           warn_undef:
7220             if (ckWARN(WARN_UNINITIALIZED))
7221                 report_uninit((const SV *)sv);
7222             return 0;
7223         }
7224     }
7225   getlen:
7226     if (i > 0 && RXp_MATCH_UTF8(rx)) {
7227         const char * const s = rx->subbeg - rx->suboffset + s1;
7228         const U8 *ep;
7229         STRLEN el;
7230
7231         i = t1 - s1;
7232         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7233                         i = el;
7234     }
7235     return i;
7236 }
7237
7238 SV*
7239 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7240 {
7241     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7242         PERL_UNUSED_ARG(rx);
7243         if (0)
7244             return NULL;
7245         else
7246             return newSVpvs("Regexp");
7247 }
7248
7249 /* Scans the name of a named buffer from the pattern.
7250  * If flags is REG_RSN_RETURN_NULL returns null.
7251  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7252  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7253  * to the parsed name as looked up in the RExC_paren_names hash.
7254  * If there is an error throws a vFAIL().. type exception.
7255  */
7256
7257 #define REG_RSN_RETURN_NULL    0
7258 #define REG_RSN_RETURN_NAME    1
7259 #define REG_RSN_RETURN_DATA    2
7260
7261 STATIC SV*
7262 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7263 {
7264     char *name_start = RExC_parse;
7265
7266     PERL_ARGS_ASSERT_REG_SCAN_NAME;
7267
7268     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7269          /* skip IDFIRST by using do...while */
7270         if (UTF)
7271             do {
7272                 RExC_parse += UTF8SKIP(RExC_parse);
7273             } while (isWORDCHAR_utf8((U8*)RExC_parse));
7274         else
7275             do {
7276                 RExC_parse++;
7277             } while (isWORDCHAR(*RExC_parse));
7278     } else {
7279         RExC_parse++; /* so the <- from the vFAIL is after the offending character */
7280         vFAIL("Group name must start with a non-digit word character");
7281     }
7282     if ( flags ) {
7283         SV* sv_name
7284             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7285                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7286         if ( flags == REG_RSN_RETURN_NAME)
7287             return sv_name;
7288         else if (flags==REG_RSN_RETURN_DATA) {
7289             HE *he_str = NULL;
7290             SV *sv_dat = NULL;
7291             if ( ! sv_name )      /* should not happen*/
7292                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7293             if (RExC_paren_names)
7294                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7295             if ( he_str )
7296                 sv_dat = HeVAL(he_str);
7297             if ( ! sv_dat )
7298                 vFAIL("Reference to nonexistent named group");
7299             return sv_dat;
7300         }
7301         else {
7302             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7303                        (unsigned long) flags);
7304         }
7305         assert(0); /* NOT REACHED */
7306     }
7307     return NULL;
7308 }
7309
7310 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
7311     int rem=(int)(RExC_end - RExC_parse);                       \
7312     int cut;                                                    \
7313     int num;                                                    \
7314     int iscut=0;                                                \
7315     if (rem>10) {                                               \
7316         rem=10;                                                 \
7317         iscut=1;                                                \
7318     }                                                           \
7319     cut=10-rem;                                                 \
7320     if (RExC_lastparse!=RExC_parse)                             \
7321         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
7322             rem, RExC_parse,                                    \
7323             cut + 4,                                            \
7324             iscut ? "..." : "<"                                 \
7325         );                                                      \
7326     else                                                        \
7327         PerlIO_printf(Perl_debug_log,"%16s","");                \
7328                                                                 \
7329     if (SIZE_ONLY)                                              \
7330        num = RExC_size + 1;                                     \
7331     else                                                        \
7332        num=REG_NODE_NUM(RExC_emit);                             \
7333     if (RExC_lastnum!=num)                                      \
7334        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
7335     else                                                        \
7336        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
7337     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
7338         (int)((depth*2)), "",                                   \
7339         (funcname)                                              \
7340     );                                                          \
7341     RExC_lastnum=num;                                           \
7342     RExC_lastparse=RExC_parse;                                  \
7343 })
7344
7345
7346
7347 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
7348     DEBUG_PARSE_MSG((funcname));                            \
7349     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
7350 })
7351 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
7352     DEBUG_PARSE_MSG((funcname));                            \
7353     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
7354 })
7355
7356 /* This section of code defines the inversion list object and its methods.  The
7357  * interfaces are highly subject to change, so as much as possible is static to
7358  * this file.  An inversion list is here implemented as a malloc'd C UV array
7359  * as an SVt_INVLIST scalar.
7360  *
7361  * An inversion list for Unicode is an array of code points, sorted by ordinal
7362  * number.  The zeroth element is the first code point in the list.  The 1th
7363  * element is the first element beyond that not in the list.  In other words,
7364  * the first range is
7365  *  invlist[0]..(invlist[1]-1)
7366  * The other ranges follow.  Thus every element whose index is divisible by two
7367  * marks the beginning of a range that is in the list, and every element not
7368  * divisible by two marks the beginning of a range not in the list.  A single
7369  * element inversion list that contains the single code point N generally
7370  * consists of two elements
7371  *  invlist[0] == N
7372  *  invlist[1] == N+1
7373  * (The exception is when N is the highest representable value on the
7374  * machine, in which case the list containing just it would be a single
7375  * element, itself.  By extension, if the last range in the list extends to
7376  * infinity, then the first element of that range will be in the inversion list
7377  * at a position that is divisible by two, and is the final element in the
7378  * list.)
7379  * Taking the complement (inverting) an inversion list is quite simple, if the
7380  * first element is 0, remove it; otherwise add a 0 element at the beginning.
7381  * This implementation reserves an element at the beginning of each inversion
7382  * list to always contain 0; there is an additional flag in the header which
7383  * indicates if the list begins at the 0, or is offset to begin at the next
7384  * element.
7385  *
7386  * More about inversion lists can be found in "Unicode Demystified"
7387  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7388  * More will be coming when functionality is added later.
7389  *
7390  * The inversion list data structure is currently implemented as an SV pointing
7391  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
7392  * array of UV whose memory management is automatically handled by the existing
7393  * facilities for SV's.
7394  *
7395  * Some of the methods should always be private to the implementation, and some
7396  * should eventually be made public */
7397
7398 /* The header definitions are in F<inline_invlist.c> */
7399
7400 PERL_STATIC_INLINE UV*
7401 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7402 {
7403     /* Returns a pointer to the first element in the inversion list's array.
7404      * This is called upon initialization of an inversion list.  Where the
7405      * array begins depends on whether the list has the code point U+0000 in it
7406      * or not.  The other parameter tells it whether the code that follows this
7407      * call is about to put a 0 in the inversion list or not.  The first
7408      * element is either the element reserved for 0, if TRUE, or the element
7409      * after it, if FALSE */
7410
7411     bool* offset = get_invlist_offset_addr(invlist);
7412     UV* zero_addr = (UV *) SvPVX(invlist);
7413
7414     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7415
7416     /* Must be empty */
7417     assert(! _invlist_len(invlist));
7418
7419     *zero_addr = 0;
7420
7421     /* 1^1 = 0; 1^0 = 1 */
7422     *offset = 1 ^ will_have_0;
7423     return zero_addr + *offset;
7424 }
7425
7426 PERL_STATIC_INLINE UV*
7427 S_invlist_array(pTHX_ SV* const invlist)
7428 {
7429     /* Returns the pointer to the inversion list's array.  Every time the
7430      * length changes, this needs to be called in case malloc or realloc moved
7431      * it */
7432
7433     PERL_ARGS_ASSERT_INVLIST_ARRAY;
7434
7435     /* Must not be empty.  If these fail, you probably didn't check for <len>
7436      * being non-zero before trying to get the array */
7437     assert(_invlist_len(invlist));
7438
7439     /* The very first element always contains zero, The array begins either
7440      * there, or if the inversion list is offset, at the element after it.
7441      * The offset header field determines which; it contains 0 or 1 to indicate
7442      * how much additionally to add */
7443     assert(0 == *(SvPVX(invlist)));
7444     return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
7445 }
7446
7447 PERL_STATIC_INLINE void
7448 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
7449 {
7450     /* Sets the current number of elements stored in the inversion list.
7451      * Updates SvCUR correspondingly */
7452
7453     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7454
7455     assert(SvTYPE(invlist) == SVt_INVLIST);
7456
7457     SvCUR_set(invlist,
7458               (len == 0)
7459                ? 0
7460                : TO_INTERNAL_SIZE(len + offset));
7461     assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
7462 }
7463
7464 PERL_STATIC_INLINE IV*
7465 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7466 {
7467     /* Return the address of the IV that is reserved to hold the cached index
7468      * */
7469
7470     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7471
7472     assert(SvTYPE(invlist) == SVt_INVLIST);
7473
7474     return &(((XINVLIST*) SvANY(invlist))->prev_index);
7475 }
7476
7477 PERL_STATIC_INLINE IV
7478 S_invlist_previous_index(pTHX_ SV* const invlist)
7479 {
7480     /* Returns cached index of previous search */
7481
7482     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7483
7484     return *get_invlist_previous_index_addr(invlist);
7485 }
7486
7487 PERL_STATIC_INLINE void
7488 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7489 {
7490     /* Caches <index> for later retrieval */
7491
7492     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7493
7494     assert(index == 0 || index < (int) _invlist_len(invlist));
7495
7496     *get_invlist_previous_index_addr(invlist) = index;
7497 }
7498
7499 PERL_STATIC_INLINE UV
7500 S_invlist_max(pTHX_ SV* const invlist)
7501 {
7502     /* Returns the maximum number of elements storable in the inversion list's
7503      * array, without having to realloc() */
7504
7505     PERL_ARGS_ASSERT_INVLIST_MAX;
7506
7507     assert(SvTYPE(invlist) == SVt_INVLIST);
7508
7509     /* Assumes worst case, in which the 0 element is not counted in the
7510      * inversion list, so subtracts 1 for that */
7511     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
7512            ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
7513            : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
7514 }
7515
7516 #ifndef PERL_IN_XSUB_RE
7517 SV*
7518 Perl__new_invlist(pTHX_ IV initial_size)
7519 {
7520
7521     /* Return a pointer to a newly constructed inversion list, with enough
7522      * space to store 'initial_size' elements.  If that number is negative, a
7523      * system default is used instead */
7524
7525     SV* new_list;
7526
7527     if (initial_size < 0) {
7528         initial_size = 10;
7529     }
7530
7531     /* Allocate the initial space */
7532     new_list = newSV_type(SVt_INVLIST);
7533
7534     /* First 1 is in case the zero element isn't in the list; second 1 is for
7535      * trailing NUL */
7536     SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
7537     invlist_set_len(new_list, 0, 0);
7538
7539     /* Force iterinit() to be used to get iteration to work */
7540     *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
7541
7542     *get_invlist_previous_index_addr(new_list) = 0;
7543
7544     return new_list;
7545 }
7546 #endif
7547
7548 STATIC SV*
7549 S__new_invlist_C_array(pTHX_ const UV* const list)
7550 {
7551     /* Return a pointer to a newly constructed inversion list, initialized to
7552      * point to <list>, which has to be in the exact correct inversion list
7553      * form, including internal fields.  Thus this is a dangerous routine that
7554      * should not be used in the wrong hands.  The passed in 'list' contains
7555      * several header fields at the beginning that are not part of the
7556      * inversion list body proper */
7557
7558     const STRLEN length = (STRLEN) list[0];
7559     const UV version_id =          list[1];
7560     const bool offset   =    cBOOL(list[2]);
7561 #define HEADER_LENGTH 3
7562     /* If any of the above changes in any way, you must change HEADER_LENGTH
7563      * (if appropriate) and regenerate INVLIST_VERSION_ID by running
7564      *      perl -E 'say int(rand 2**31-1)'
7565      */
7566 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
7567                                         data structure type, so that one being
7568                                         passed in can be validated to be an
7569                                         inversion list of the correct vintage.
7570                                        */
7571
7572     SV* invlist = newSV_type(SVt_INVLIST);
7573
7574     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7575
7576     if (version_id != INVLIST_VERSION_ID) {
7577         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7578     }
7579
7580     /* The generated array passed in includes header elements that aren't part
7581      * of the list proper, so start it just after them */
7582     SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
7583
7584     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
7585                                shouldn't touch it */
7586
7587     *(get_invlist_offset_addr(invlist)) = offset;
7588
7589     /* The 'length' passed to us is the physical number of elements in the
7590      * inversion list.  But if there is an offset the logical number is one
7591      * less than that */
7592     invlist_set_len(invlist, length  - offset, offset);
7593
7594     invlist_set_previous_index(invlist, 0);
7595
7596     /* Initialize the iteration pointer. */
7597     invlist_iterfinish(invlist);
7598
7599     return invlist;
7600 }
7601
7602 STATIC void
7603 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7604 {
7605     /* Grow the maximum size of an inversion list */
7606
7607     PERL_ARGS_ASSERT_INVLIST_EXTEND;
7608
7609     assert(SvTYPE(invlist) == SVt_INVLIST);
7610
7611     /* Add one to account for the zero element at the beginning which may not
7612      * be counted by the calling parameters */
7613     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
7614 }
7615
7616 PERL_STATIC_INLINE void
7617 S_invlist_trim(pTHX_ SV* const invlist)
7618 {
7619     PERL_ARGS_ASSERT_INVLIST_TRIM;
7620
7621     assert(SvTYPE(invlist) == SVt_INVLIST);
7622
7623     /* Change the length of the inversion list to how many entries it currently
7624      * has */
7625     SvPV_shrink_to_cur((SV *) invlist);
7626 }
7627
7628 STATIC void
7629 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7630 {
7631    /* Subject to change or removal.  Append the range from 'start' to 'end' at
7632     * the end of the inversion list.  The range must be above any existing
7633     * ones. */
7634
7635     UV* array;
7636     UV max = invlist_max(invlist);
7637     UV len = _invlist_len(invlist);
7638     bool offset;
7639
7640     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7641
7642     if (len == 0) { /* Empty lists must be initialized */
7643         offset = start != 0;
7644         array = _invlist_array_init(invlist, ! offset);
7645     }
7646     else {
7647         /* Here, the existing list is non-empty. The current max entry in the
7648          * list is generally the first value not in the set, except when the
7649          * set extends to the end of permissible values, in which case it is
7650          * the first entry in that final set, and so this call is an attempt to
7651          * append out-of-order */
7652
7653         UV final_element = len - 1;
7654         array = invlist_array(invlist);
7655         if (array[final_element] > start
7656             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7657         {
7658             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7659                        array[final_element], start,
7660                        ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7661         }
7662
7663         /* Here, it is a legal append.  If the new range begins with the first
7664          * value not in the set, it is extending the set, so the new first
7665          * value not in the set is one greater than the newly extended range.
7666          * */
7667         offset = *get_invlist_offset_addr(invlist);
7668         if (array[final_element] == start) {
7669             if (end != UV_MAX) {
7670                 array[final_element] = end + 1;
7671             }
7672             else {
7673                 /* But if the end is the maximum representable on the machine,
7674                  * just let the range that this would extend to have no end */
7675                 invlist_set_len(invlist, len - 1, offset);
7676             }
7677             return;
7678         }
7679     }
7680
7681     /* Here the new range doesn't extend any existing set.  Add it */
7682
7683     len += 2;   /* Includes an element each for the start and end of range */
7684
7685     /* If wll overflow the existing space, extend, which may cause the array to
7686      * be moved */
7687     if (max < len) {
7688         invlist_extend(invlist, len);
7689
7690         /* Have to set len here to avoid assert failure in invlist_array() */
7691         invlist_set_len(invlist, len, offset);
7692
7693         array = invlist_array(invlist);
7694     }
7695     else {
7696         invlist_set_len(invlist, len, offset);
7697     }
7698
7699     /* The next item on the list starts the range, the one after that is
7700      * one past the new range.  */
7701     array[len - 2] = start;
7702     if (end != UV_MAX) {
7703         array[len - 1] = end + 1;
7704     }
7705     else {
7706         /* But if the end is the maximum representable on the machine, just let
7707          * the range have no end */
7708         invlist_set_len(invlist, len - 1, offset);
7709     }
7710 }
7711
7712 #ifndef PERL_IN_XSUB_RE
7713
7714 IV
7715 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7716 {
7717     /* Searches the inversion list for the entry that contains the input code
7718      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
7719      * return value is the index into the list's array of the range that
7720      * contains <cp> */
7721
7722     IV low = 0;
7723     IV mid;
7724     IV high = _invlist_len(invlist);
7725     const IV highest_element = high - 1;
7726     const UV* array;
7727
7728     PERL_ARGS_ASSERT__INVLIST_SEARCH;
7729
7730     /* If list is empty, return failure. */
7731     if (high == 0) {
7732         return -1;
7733     }
7734
7735     /* (We can't get the array unless we know the list is non-empty) */
7736     array = invlist_array(invlist);
7737
7738     mid = invlist_previous_index(invlist);
7739     assert(mid >=0 && mid <= highest_element);
7740
7741     /* <mid> contains the cache of the result of the previous call to this
7742      * function (0 the first time).  See if this call is for the same result,
7743      * or if it is for mid-1.  This is under the theory that calls to this
7744      * function will often be for related code points that are near each other.
7745      * And benchmarks show that caching gives better results.  We also test
7746      * here if the code point is within the bounds of the list.  These tests
7747      * replace others that would have had to be made anyway to make sure that
7748      * the array bounds were not exceeded, and these give us extra information
7749      * at the same time */
7750     if (cp >= array[mid]) {
7751         if (cp >= array[highest_element]) {
7752             return highest_element;
7753         }
7754
7755         /* Here, array[mid] <= cp < array[highest_element].  This means that
7756          * the final element is not the answer, so can exclude it; it also
7757          * means that <mid> is not the final element, so can refer to 'mid + 1'
7758          * safely */
7759         if (cp < array[mid + 1]) {
7760             return mid;
7761         }
7762         high--;
7763         low = mid + 1;
7764     }
7765     else { /* cp < aray[mid] */
7766         if (cp < array[0]) { /* Fail if outside the array */
7767             return -1;
7768         }
7769         high = mid;
7770         if (cp >= array[mid - 1]) {
7771             goto found_entry;
7772         }
7773     }
7774
7775     /* Binary search.  What we are looking for is <i> such that
7776      *  array[i] <= cp < array[i+1]
7777      * The loop below converges on the i+1.  Note that there may not be an
7778      * (i+1)th element in the array, and things work nonetheless */
7779     while (low < high) {
7780         mid = (low + high) / 2;
7781         assert(mid <= highest_element);
7782         if (array[mid] <= cp) { /* cp >= array[mid] */
7783             low = mid + 1;
7784
7785             /* We could do this extra test to exit the loop early.
7786             if (cp < array[low]) {
7787                 return mid;
7788             }
7789             */
7790         }
7791         else { /* cp < array[mid] */
7792             high = mid;
7793         }
7794     }
7795
7796   found_entry:
7797     high--;
7798     invlist_set_previous_index(invlist, high);
7799     return high;
7800 }
7801
7802 void
7803 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7804 {
7805     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7806      * but is used when the swash has an inversion list.  This makes this much
7807      * faster, as it uses a binary search instead of a linear one.  This is
7808      * intimately tied to that function, and perhaps should be in utf8.c,
7809      * except it is intimately tied to inversion lists as well.  It assumes
7810      * that <swatch> is all 0's on input */
7811
7812     UV current = start;
7813     const IV len = _invlist_len(invlist);
7814     IV i;
7815     const UV * array;
7816
7817     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7818
7819     if (len == 0) { /* Empty inversion list */
7820         return;
7821     }
7822
7823     array = invlist_array(invlist);
7824
7825     /* Find which element it is */
7826     i = _invlist_search(invlist, start);
7827
7828     /* We populate from <start> to <end> */
7829     while (current < end) {
7830         UV upper;
7831
7832         /* The inversion list gives the results for every possible code point
7833          * after the first one in the list.  Only those ranges whose index is
7834          * even are ones that the inversion list matches.  For the odd ones,
7835          * and if the initial code point is not in the list, we have to skip
7836          * forward to the next element */
7837         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7838             i++;
7839             if (i >= len) { /* Finished if beyond the end of the array */
7840                 return;
7841             }
7842             current = array[i];
7843             if (current >= end) {   /* Finished if beyond the end of what we
7844                                        are populating */
7845                 if (LIKELY(end < UV_MAX)) {
7846                     return;
7847                 }
7848
7849                 /* We get here when the upper bound is the maximum
7850                  * representable on the machine, and we are looking for just
7851                  * that code point.  Have to special case it */
7852                 i = len;
7853                 goto join_end_of_list;
7854             }
7855         }
7856         assert(current >= start);
7857
7858         /* The current range ends one below the next one, except don't go past
7859          * <end> */
7860         i++;
7861         upper = (i < len && array[i] < end) ? array[i] : end;
7862
7863         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
7864          * for each code point in it */
7865         for (; current < upper; current++) {
7866             const STRLEN offset = (STRLEN)(current - start);
7867             swatch[offset >> 3] |= 1 << (offset & 7);
7868         }
7869
7870     join_end_of_list:
7871
7872         /* Quit if at the end of the list */
7873         if (i >= len) {
7874
7875             /* But first, have to deal with the highest possible code point on
7876              * the platform.  The previous code assumes that <end> is one
7877              * beyond where we want to populate, but that is impossible at the
7878              * platform's infinity, so have to handle it specially */
7879             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7880             {
7881                 const STRLEN offset = (STRLEN)(end - start);
7882                 swatch[offset >> 3] |= 1 << (offset & 7);
7883             }
7884             return;
7885         }
7886
7887         /* Advance to the next range, which will be for code points not in the
7888          * inversion list */
7889         current = array[i];
7890     }
7891
7892     return;
7893 }
7894
7895 void
7896 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** output)
7897 {
7898     /* Take the union of two inversion lists and point <output> to it.  *output
7899      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7900      * the reference count to that list will be decremented if not already a
7901      * temporary (mortal); otherwise *output will be made correspondingly
7902      * mortal.  The first list, <a>, may be NULL, in which case a copy of the
7903      * second list is returned.  If <complement_b> is TRUE, the union is taken
7904      * of the complement (inversion) of <b> instead of b itself.
7905      *
7906      * The basis for this comes from "Unicode Demystified" Chapter 13 by
7907      * Richard Gillam, published by Addison-Wesley, and explained at some
7908      * length there.  The preface says to incorporate its examples into your
7909      * code at your own risk.
7910      *
7911      * The algorithm is like a merge sort.
7912      *
7913      * XXX A potential performance improvement is to keep track as we go along
7914      * if only one of the inputs contributes to the result, meaning the other
7915      * is a subset of that one.  In that case, we can skip the final copy and
7916      * return the larger of the input lists, but then outside code might need
7917      * to keep track of whether to free the input list or not */
7918
7919     const UV* array_a;    /* a's array */
7920     const UV* array_b;
7921     UV len_a;       /* length of a's array */
7922     UV len_b;
7923
7924     SV* u;                      /* the resulting union */
7925     UV* array_u;
7926     UV len_u;
7927
7928     UV i_a = 0;             /* current index into a's array */
7929     UV i_b = 0;
7930     UV i_u = 0;
7931
7932     /* running count, as explained in the algorithm source book; items are
7933      * stopped accumulating and are output when the count changes to/from 0.
7934      * The count is incremented when we start a range that's in the set, and
7935      * decremented when we start a range that's not in the set.  So its range
7936      * is 0 to 2.  Only when the count is zero is something not in the set.
7937      */
7938     UV count = 0;
7939
7940     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7941     assert(a != b);
7942
7943     /* If either one is empty, the union is the other one */
7944     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
7945         bool make_temp = FALSE; /* Should we mortalize the result? */
7946
7947         if (*output == a) {
7948             if (a != NULL) {
7949                 if (! (make_temp = cBOOL(SvTEMP(a)))) {
7950                     SvREFCNT_dec_NN(a);
7951                 }
7952             }
7953         }
7954         if (*output != b) {
7955             *output = invlist_clone(b);
7956             if (complement_b) {
7957                 _invlist_invert(*output);
7958             }
7959         } /* else *output already = b; */
7960
7961         if (make_temp) {
7962             sv_2mortal(*output);
7963         }
7964         return;
7965     }
7966     else if ((len_b = _invlist_len(b)) == 0) {
7967         bool make_temp = FALSE;
7968         if (*output == b) {
7969             if (! (make_temp = cBOOL(SvTEMP(b)))) {
7970                 SvREFCNT_dec_NN(b);
7971             }
7972         }
7973
7974         /* The complement of an empty list is a list that has everything in it,
7975          * so the union with <a> includes everything too */
7976         if (complement_b) {
7977             if (a == *output) {
7978                 if (! (make_temp = cBOOL(SvTEMP(a)))) {
7979                     SvREFCNT_dec_NN(a);
7980                 }
7981             }
7982             *output = _new_invlist(1);
7983             _append_range_to_invlist(*output, 0, UV_MAX);
7984         }
7985         else if (*output != a) {
7986             *output = invlist_clone(a);
7987         }
7988         /* else *output already = a; */
7989
7990         if (make_temp) {
7991             sv_2mortal(*output);
7992         }
7993         return;
7994     }
7995
7996     /* Here both lists exist and are non-empty */
7997     array_a = invlist_array(a);
7998     array_b = invlist_array(b);
7999
8000     /* If are to take the union of 'a' with the complement of b, set it
8001      * up so are looking at b's complement. */
8002     if (complement_b) {
8003
8004         /* To complement, we invert: if the first element is 0, remove it.  To
8005          * do this, we just pretend the array starts one later */
8006         if (array_b[0] == 0) {
8007             array_b++;
8008             len_b--;
8009         }
8010         else {
8011
8012             /* But if the first element is not zero, we pretend the list starts
8013              * at the 0 that is always stored immediately before the array. */
8014             array_b--;
8015             len_b++;
8016         }
8017     }
8018
8019     /* Size the union for the worst case: that the sets are completely
8020      * disjoint */
8021     u = _new_invlist(len_a + len_b);
8022
8023     /* Will contain U+0000 if either component does */
8024     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8025                                       || (len_b > 0 && array_b[0] == 0));
8026
8027     /* Go through each list item by item, stopping when exhausted one of
8028      * them */
8029     while (i_a < len_a && i_b < len_b) {
8030         UV cp;      /* The element to potentially add to the union's array */
8031         bool cp_in_set;   /* is it in the the input list's set or not */
8032
8033         /* We need to take one or the other of the two inputs for the union.
8034          * Since we are merging two sorted lists, we take the smaller of the
8035          * next items.  In case of a tie, we take the one that is in its set
8036          * first.  If we took one not in the set first, it would decrement the
8037          * count, possibly to 0 which would cause it to be output as ending the
8038          * range, and the next time through we would take the same number, and
8039          * output it again as beginning the next range.  By doing it the
8040          * opposite way, there is no possibility that the count will be
8041          * momentarily decremented to 0, and thus the two adjoining ranges will
8042          * be seamlessly merged.  (In a tie and both are in the set or both not
8043          * in the set, it doesn't matter which we take first.) */
8044         if (array_a[i_a] < array_b[i_b]
8045             || (array_a[i_a] == array_b[i_b]
8046                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8047         {
8048             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8049             cp= array_a[i_a++];
8050         }
8051         else {
8052             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8053             cp = array_b[i_b++];
8054         }
8055
8056         /* Here, have chosen which of the two inputs to look at.  Only output
8057          * if the running count changes to/from 0, which marks the
8058          * beginning/end of a range in that's in the set */
8059         if (cp_in_set) {
8060             if (count == 0) {
8061                 array_u[i_u++] = cp;
8062             }
8063             count++;
8064         }
8065         else {
8066             count--;
8067             if (count == 0) {
8068                 array_u[i_u++] = cp;
8069             }
8070         }
8071     }
8072
8073     /* Here, we are finished going through at least one of the lists, which
8074      * means there is something remaining in at most one.  We check if the list
8075      * that hasn't been exhausted is positioned such that we are in the middle
8076      * of a range in its set or not.  (i_a and i_b point to the element beyond
8077      * the one we care about.) If in the set, we decrement 'count'; if 0, there
8078      * is potentially more to output.
8079      * There are four cases:
8080      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
8081      *     in the union is entirely from the non-exhausted set.
8082      *  2) Both were in their sets, count is 2.  Nothing further should
8083      *     be output, as everything that remains will be in the exhausted
8084      *     list's set, hence in the union; decrementing to 1 but not 0 insures
8085      *     that
8086      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
8087      *     Nothing further should be output because the union includes
8088      *     everything from the exhausted set.  Not decrementing ensures that.
8089      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8090      *     decrementing to 0 insures that we look at the remainder of the
8091      *     non-exhausted set */
8092     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8093         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8094     {
8095         count--;
8096     }
8097
8098     /* The final length is what we've output so far, plus what else is about to
8099      * be output.  (If 'count' is non-zero, then the input list we exhausted
8100      * has everything remaining up to the machine's limit in its set, and hence
8101      * in the union, so there will be no further output. */
8102     len_u = i_u;
8103     if (count == 0) {
8104         /* At most one of the subexpressions will be non-zero */
8105         len_u += (len_a - i_a) + (len_b - i_b);
8106     }
8107
8108     /* Set result to final length, which can change the pointer to array_u, so
8109      * re-find it */
8110     if (len_u != _invlist_len(u)) {
8111         invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8112         invlist_trim(u);
8113         array_u = invlist_array(u);
8114     }
8115
8116     /* When 'count' is 0, the list that was exhausted (if one was shorter than
8117      * the other) ended with everything above it not in its set.  That means
8118      * that the remaining part of the union is precisely the same as the
8119      * non-exhausted list, so can just copy it unchanged.  (If both list were
8120      * exhausted at the same time, then the operations below will be both 0.)
8121      */
8122     if (count == 0) {
8123         IV copy_count; /* At most one will have a non-zero copy count */
8124         if ((copy_count = len_a - i_a) > 0) {
8125             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8126         }
8127         else if ((copy_count = len_b - i_b) > 0) {
8128             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8129         }
8130     }
8131
8132     /*  We may be removing a reference to one of the inputs.  If so, the output
8133      *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8134      *  count decremented) */
8135     if (a == *output || b == *output) {
8136         assert(! invlist_is_iterating(*output));
8137         if ((SvTEMP(*output))) {
8138             sv_2mortal(u);
8139         }
8140         else {
8141             SvREFCNT_dec_NN(*output);
8142         }
8143     }
8144
8145     *output = u;
8146
8147     return;
8148 }
8149
8150 void
8151 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** i)
8152 {
8153     /* Take the intersection of two inversion lists and point <i> to it.  *i
8154      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8155      * the reference count to that list will be decremented if not already a
8156      * temporary (mortal); otherwise *i will be made correspondingly mortal.
8157      * The first list, <a>, may be NULL, in which case an empty list is
8158      * returned.  If <complement_b> is TRUE, the result will be the
8159      * intersection of <a> and the complement (or inversion) of <b> instead of
8160      * <b> directly.
8161      *
8162      * The basis for this comes from "Unicode Demystified" Chapter 13 by
8163      * Richard Gillam, published by Addison-Wesley, and explained at some
8164      * length there.  The preface says to incorporate its examples into your
8165      * code at your own risk.  In fact, it had bugs
8166      *
8167      * The algorithm is like a merge sort, and is essentially the same as the
8168      * union above
8169      */
8170
8171     const UV* array_a;          /* a's array */
8172     const UV* array_b;
8173     UV len_a;   /* length of a's array */
8174     UV len_b;
8175
8176     SV* r;                   /* the resulting intersection */
8177     UV* array_r;
8178     UV len_r;
8179
8180     UV i_a = 0;             /* current index into a's array */
8181     UV i_b = 0;
8182     UV i_r = 0;
8183
8184     /* running count, as explained in the algorithm source book; items are
8185      * stopped accumulating and are output when the count changes to/from 2.
8186      * The count is incremented when we start a range that's in the set, and
8187      * decremented when we start a range that's not in the set.  So its range
8188      * is 0 to 2.  Only when the count is 2 is something in the intersection.
8189      */
8190     UV count = 0;
8191
8192     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8193     assert(a != b);
8194
8195     /* Special case if either one is empty */
8196     len_a = (a == NULL) ? 0 : _invlist_len(a);
8197     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8198         bool make_temp = FALSE;
8199
8200         if (len_a != 0 && complement_b) {
8201
8202             /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8203              * be empty.  Here, also we are using 'b's complement, which hence
8204              * must be every possible code point.  Thus the intersection is
8205              * simply 'a'. */
8206             if (*i != a) {
8207                 if (*i == b) {
8208                     if (! (make_temp = cBOOL(SvTEMP(b)))) {
8209                         SvREFCNT_dec_NN(b);
8210                     }
8211                 }
8212
8213                 *i = invlist_clone(a);
8214             }
8215             /* else *i is already 'a' */
8216
8217             if (make_temp) {
8218                 sv_2mortal(*i);
8219             }
8220             return;
8221         }
8222
8223         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
8224          * intersection must be empty */
8225         if (*i == a) {
8226             if (! (make_temp = cBOOL(SvTEMP(a)))) {
8227                 SvREFCNT_dec_NN(a);
8228             }
8229         }
8230         else if (*i == b) {
8231             if (! (make_temp = cBOOL(SvTEMP(b)))) {
8232                 SvREFCNT_dec_NN(b);
8233             }
8234         }
8235         *i = _new_invlist(0);
8236         if (make_temp) {
8237             sv_2mortal(*i);
8238         }
8239
8240         return;
8241     }
8242
8243     /* Here both lists exist and are non-empty */
8244     array_a = invlist_array(a);
8245     array_b = invlist_array(b);
8246
8247     /* If are to take the intersection of 'a' with the complement of b, set it
8248      * up so are looking at b's complement. */
8249     if (complement_b) {
8250
8251         /* To complement, we invert: if the first element is 0, remove it.  To
8252          * do this, we just pretend the array starts one later */
8253         if (array_b[0] == 0) {
8254             array_b++;
8255             len_b--;
8256         }
8257         else {
8258
8259             /* But if the first element is not zero, we pretend the list starts
8260              * at the 0 that is always stored immediately before the array. */
8261             array_b--;
8262             len_b++;
8263         }
8264     }
8265
8266     /* Size the intersection for the worst case: that the intersection ends up
8267      * fragmenting everything to be completely disjoint */
8268     r= _new_invlist(len_a + len_b);
8269
8270     /* Will contain U+0000 iff both components do */
8271     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8272                                      && len_b > 0 && array_b[0] == 0);
8273
8274     /* Go through each list item by item, stopping when exhausted one of
8275      * them */
8276     while (i_a < len_a && i_b < len_b) {
8277         UV cp;      /* The element to potentially add to the intersection's
8278                        array */
8279         bool cp_in_set; /* Is it in the input list's set or not */
8280
8281         /* We need to take one or the other of the two inputs for the
8282          * intersection.  Since we are merging two sorted lists, we take the
8283          * smaller of the next items.  In case of a tie, we take the one that
8284          * is not in its set first (a difference from the union algorithm).  If
8285          * we took one in the set first, it would increment the count, possibly
8286          * to 2 which would cause it to be output as starting a range in the
8287          * intersection, and the next time through we would take that same
8288          * number, and output it again as ending the set.  By doing it the
8289          * opposite of this, there is no possibility that the count will be
8290          * momentarily incremented to 2.  (In a tie and both are in the set or
8291          * both not in the set, it doesn't matter which we take first.) */
8292         if (array_a[i_a] < array_b[i_b]
8293             || (array_a[i_a] == array_b[i_b]
8294                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8295         {
8296             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8297             cp= array_a[i_a++];
8298         }
8299         else {
8300             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8301             cp= array_b[i_b++];
8302         }
8303
8304         /* Here, have chosen which of the two inputs to look at.  Only output
8305          * if the running count changes to/from 2, which marks the
8306          * beginning/end of a range that's in the intersection */
8307         if (cp_in_set) {
8308             count++;
8309             if (count == 2) {
8310                 array_r[i_r++] = cp;
8311             }
8312         }
8313         else {
8314             if (count == 2) {
8315                 array_r[i_r++] = cp;
8316             }
8317             count--;
8318         }
8319     }
8320
8321     /* Here, we are finished going through at least one of the lists, which
8322      * means there is something remaining in at most one.  We check if the list
8323      * that has been exhausted is positioned such that we are in the middle
8324      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
8325      * the ones we care about.)  There are four cases:
8326      *  1) Both weren't in their sets, count is 0, and remains 0.  There's
8327      *     nothing left in the intersection.
8328      *  2) Both were in their sets, count is 2 and perhaps is incremented to
8329      *     above 2.  What should be output is exactly that which is in the
8330      *     non-exhausted set, as everything it has is also in the intersection
8331      *     set, and everything it doesn't have can't be in the intersection
8332      *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8333      *     gets incremented to 2.  Like the previous case, the intersection is
8334      *     everything that remains in the non-exhausted set.
8335      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8336      *     remains 1.  And the intersection has nothing more. */
8337     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8338         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8339     {
8340         count++;
8341     }
8342
8343     /* The final length is what we've output so far plus what else is in the
8344      * intersection.  At most one of the subexpressions below will be non-zero */
8345     len_r = i_r;
8346     if (count >= 2) {
8347         len_r += (len_a - i_a) + (len_b - i_b);
8348     }
8349
8350     /* Set result to final length, which can change the pointer to array_r, so
8351      * re-find it */
8352     if (len_r != _invlist_len(r)) {
8353         invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8354         invlist_trim(r);
8355         array_r = invlist_array(r);
8356     }
8357
8358     /* Finish outputting any remaining */
8359     if (count >= 2) { /* At most one will have a non-zero copy count */
8360         IV copy_count;
8361         if ((copy_count = len_a - i_a) > 0) {
8362             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8363         }
8364         else if ((copy_count = len_b - i_b) > 0) {
8365             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8366         }
8367     }
8368
8369     /*  We may be removing a reference to one of the inputs.  If so, the output
8370      *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8371      *  count decremented) */
8372     if (a == *i || b == *i) {
8373         assert(! invlist_is_iterating(*i));
8374         if (SvTEMP(*i)) {
8375             sv_2mortal(r);
8376         }
8377         else {
8378             SvREFCNT_dec_NN(*i);
8379         }
8380     }
8381
8382     *i = r;
8383
8384     return;
8385 }
8386
8387 SV*
8388 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8389 {
8390     /* Add the range from 'start' to 'end' inclusive to the inversion list's
8391      * set.  A pointer to the inversion list is returned.  This may actually be
8392      * a new list, in which case the passed in one has been destroyed.  The
8393      * passed in inversion list can be NULL, in which case a new one is created
8394      * with just the one range in it */
8395
8396     SV* range_invlist;
8397     UV len;
8398
8399     if (invlist == NULL) {
8400         invlist = _new_invlist(2);
8401         len = 0;
8402     }
8403     else {
8404         len = _invlist_len(invlist);
8405     }
8406
8407     /* If comes after the final entry actually in the list, can just append it
8408      * to the end, */
8409     if (len == 0
8410         || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8411             && start >= invlist_array(invlist)[len - 1]))
8412     {
8413         _append_range_to_invlist(invlist, start, end);
8414         return invlist;
8415     }
8416
8417     /* Here, can't just append things, create and return a new inversion list
8418      * which is the union of this range and the existing inversion list */
8419     range_invlist = _new_invlist(2);
8420     _append_range_to_invlist(range_invlist, start, end);
8421
8422     _invlist_union(invlist, range_invlist, &invlist);
8423
8424     /* The temporary can be freed */
8425     SvREFCNT_dec_NN(range_invlist);
8426
8427     return invlist;
8428 }
8429
8430 #endif
8431
8432 PERL_STATIC_INLINE SV*
8433 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8434     return _add_range_to_invlist(invlist, cp, cp);
8435 }
8436
8437 #ifndef PERL_IN_XSUB_RE
8438 void
8439 Perl__invlist_invert(pTHX_ SV* const invlist)
8440 {
8441     /* Complement the input inversion list.  This adds a 0 if the list didn't
8442      * have a zero; removes it otherwise.  As described above, the data
8443      * structure is set up so that this is very efficient */
8444
8445     PERL_ARGS_ASSERT__INVLIST_INVERT;
8446
8447     assert(! invlist_is_iterating(invlist));
8448
8449     /* The inverse of matching nothing is matching everything */
8450     if (_invlist_len(invlist) == 0) {
8451         _append_range_to_invlist(invlist, 0, UV_MAX);
8452         return;
8453     }
8454
8455     *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
8456 }
8457
8458 void
8459 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
8460 {
8461     /* Complement the input inversion list (which must be a Unicode property,
8462      * all of which don't match above the Unicode maximum code point.)  And
8463      * Perl has chosen to not have the inversion match above that either.  This
8464      * adds a 0x110000 if the list didn't end with it, and removes it if it did
8465      */
8466
8467     UV len;
8468     UV* array;
8469
8470     PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
8471
8472     _invlist_invert(invlist);
8473
8474     len = _invlist_len(invlist);
8475
8476     if (len != 0) { /* If empty do nothing */
8477         array = invlist_array(invlist);
8478         if (array[len - 1] != PERL_UNICODE_MAX + 1) {
8479             /* Add 0x110000.  First, grow if necessary */
8480             len++;
8481             if (invlist_max(invlist) < len) {
8482                 invlist_extend(invlist, len);
8483                 array = invlist_array(invlist);
8484             }
8485             invlist_set_len(invlist, len, *get_invlist_offset_addr(invlist));
8486             array[len - 1] = PERL_UNICODE_MAX + 1;
8487         }
8488         else {  /* Remove the 0x110000 */
8489             invlist_set_len(invlist, len - 1, *get_invlist_offset_addr(invlist));
8490         }
8491     }
8492
8493     return;
8494 }
8495 #endif
8496
8497 PERL_STATIC_INLINE SV*
8498 S_invlist_clone(pTHX_ SV* const invlist)
8499 {
8500
8501     /* Return a new inversion list that is a copy of the input one, which is
8502      * unchanged.  The new list will not be mortal even if the old one was. */
8503
8504     /* Need to allocate extra space to accommodate Perl's addition of a
8505      * trailing NUL to SvPV's, since it thinks they are always strings */
8506     SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8507     STRLEN physical_length = SvCUR(invlist);
8508     bool offset = *(get_invlist_offset_addr(invlist));
8509
8510     PERL_ARGS_ASSERT_INVLIST_CLONE;
8511
8512     *(get_invlist_offset_addr(new_invlist)) = offset;
8513     invlist_set_len(new_invlist, _invlist_len(invlist), offset);
8514     Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
8515
8516     return new_invlist;
8517 }
8518
8519 PERL_STATIC_INLINE STRLEN*
8520 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8521 {
8522     /* Return the address of the UV that contains the current iteration
8523      * position */
8524
8525     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8526
8527     assert(SvTYPE(invlist) == SVt_INVLIST);
8528
8529     return &(((XINVLIST*) SvANY(invlist))->iterator);
8530 }
8531
8532 PERL_STATIC_INLINE void
8533 S_invlist_iterinit(pTHX_ SV* invlist)   /* Initialize iterator for invlist */
8534 {
8535     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8536
8537     *get_invlist_iter_addr(invlist) = 0;
8538 }
8539
8540 PERL_STATIC_INLINE void
8541 S_invlist_iterfinish(pTHX_ SV* invlist)
8542 {
8543     /* Terminate iterator for invlist.  This is to catch development errors.
8544      * Any iteration that is interrupted before completed should call this
8545      * function.  Functions that add code points anywhere else but to the end
8546      * of an inversion list assert that they are not in the middle of an
8547      * iteration.  If they were, the addition would make the iteration
8548      * problematical: if the iteration hadn't reached the place where things
8549      * were being added, it would be ok */
8550
8551     PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8552
8553     *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
8554 }
8555
8556 STATIC bool
8557 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8558 {
8559     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8560      * This call sets in <*start> and <*end>, the next range in <invlist>.
8561      * Returns <TRUE> if successful and the next call will return the next
8562      * range; <FALSE> if was already at the end of the list.  If the latter,
8563      * <*start> and <*end> are unchanged, and the next call to this function
8564      * will start over at the beginning of the list */
8565
8566     STRLEN* pos = get_invlist_iter_addr(invlist);
8567     UV len = _invlist_len(invlist);
8568     UV *array;
8569
8570     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8571
8572     if (*pos >= len) {
8573         *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
8574         return FALSE;
8575     }
8576
8577     array = invlist_array(invlist);
8578
8579     *start = array[(*pos)++];
8580
8581     if (*pos >= len) {
8582         *end = UV_MAX;
8583     }
8584     else {
8585         *end = array[(*pos)++] - 1;
8586     }
8587
8588     return TRUE;
8589 }
8590
8591 PERL_STATIC_INLINE bool
8592 S_invlist_is_iterating(pTHX_ SV* const invlist)
8593 {
8594     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8595
8596     return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8597 }
8598
8599 PERL_STATIC_INLINE UV
8600 S_invlist_highest(pTHX_ SV* const invlist)
8601 {
8602     /* Returns the highest code point that matches an inversion list.  This API
8603      * has an ambiguity, as it returns 0 under either the highest is actually
8604      * 0, or if the list is empty.  If this distinction matters to you, check
8605      * for emptiness before calling this function */
8606
8607     UV len = _invlist_len(invlist);
8608     UV *array;
8609
8610     PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8611
8612     if (len == 0) {
8613         return 0;
8614     }
8615
8616     array = invlist_array(invlist);
8617
8618     /* The last element in the array in the inversion list always starts a
8619      * range that goes to infinity.  That range may be for code points that are
8620      * matched in the inversion list, or it may be for ones that aren't
8621      * matched.  In the latter case, the highest code point in the set is one
8622      * less than the beginning of this range; otherwise it is the final element
8623      * of this range: infinity */
8624     return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8625            ? UV_MAX
8626            : array[len - 1] - 1;
8627 }
8628
8629 #ifndef PERL_IN_XSUB_RE
8630 SV *
8631 Perl__invlist_contents(pTHX_ SV* const invlist)
8632 {
8633     /* Get the contents of an inversion list into a string SV so that they can
8634      * be printed out.  It uses the format traditionally done for debug tracing
8635      */
8636
8637     UV start, end;
8638     SV* output = newSVpvs("\n");
8639
8640     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8641
8642     assert(! invlist_is_iterating(invlist));
8643
8644     invlist_iterinit(invlist);
8645     while (invlist_iternext(invlist, &start, &end)) {
8646         if (end == UV_MAX) {
8647             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8648         }
8649         else if (end != start) {
8650             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8651                     start,       end);
8652         }
8653         else {
8654             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8655         }
8656     }
8657
8658     return output;
8659 }
8660 #endif
8661
8662 #ifndef PERL_IN_XSUB_RE
8663 void
8664 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, const char * const indent, SV* const invlist)
8665 {
8666     /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
8667      * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
8668      * the string 'indent'.  The output looks like this:
8669          [0] 0x000A .. 0x000D
8670          [2] 0x0085
8671          [4] 0x2028 .. 0x2029
8672          [6] 0x3104 .. INFINITY
8673      * This means that the first range of code points matched by the list are
8674      * 0xA through 0xD; the second range contains only the single code point
8675      * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
8676      * are used to define each range (except if the final range extends to
8677      * infinity, only a single element is needed).  The array index of the
8678      * first element for the corresponding range is given in brackets. */
8679
8680     UV start, end;
8681     STRLEN count = 0;
8682
8683     PERL_ARGS_ASSERT__INVLIST_DUMP;
8684
8685     if (invlist_is_iterating(invlist)) {
8686         Perl_dump_indent(aTHX_ level, file,
8687              "%sCan't dump inversion list because is in middle of iterating\n",
8688              indent);
8689         return;
8690     }
8691
8692     invlist_iterinit(invlist);
8693     while (invlist_iternext(invlist, &start, &end)) {
8694         if (end == UV_MAX) {
8695             Perl_dump_indent(aTHX_ level, file,
8696                                        "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
8697                                    indent, (UV)count, start);
8698         }
8699         else if (end != start) {
8700             Perl_dump_indent(aTHX_ level, file,
8701                                     "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
8702                                 indent, (UV)count, start,         end);
8703         }
8704         else {
8705             Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
8706                                             indent, (UV)count, start);
8707         }
8708         count += 2;
8709     }
8710 }
8711 #endif
8712
8713 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
8714 bool
8715 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
8716 {
8717     /* Return a boolean as to if the two passed in inversion lists are
8718      * identical.  The final argument, if TRUE, says to take the complement of
8719      * the second inversion list before doing the comparison */
8720
8721     const UV* array_a = invlist_array(a);
8722     const UV* array_b = invlist_array(b);
8723     UV len_a = _invlist_len(a);
8724     UV len_b = _invlist_len(b);
8725
8726     UV i = 0;               /* current index into the arrays */
8727     bool retval = TRUE;     /* Assume are identical until proven otherwise */
8728
8729     PERL_ARGS_ASSERT__INVLISTEQ;
8730
8731     /* If are to compare 'a' with the complement of b, set it
8732      * up so are looking at b's complement. */
8733     if (complement_b) {
8734
8735         /* The complement of nothing is everything, so <a> would have to have
8736          * just one element, starting at zero (ending at infinity) */
8737         if (len_b == 0) {
8738             return (len_a == 1 && array_a[0] == 0);
8739         }
8740         else if (array_b[0] == 0) {
8741
8742             /* Otherwise, to complement, we invert.  Here, the first element is
8743              * 0, just remove it.  To do this, we just pretend the array starts
8744              * one later */
8745
8746             array_b++;
8747             len_b--;
8748         }
8749         else {
8750
8751             /* But if the first element is not zero, we pretend the list starts
8752              * at the 0 that is always stored immediately before the array. */
8753             array_b--;
8754             len_b++;
8755         }
8756     }
8757
8758     /* Make sure that the lengths are the same, as well as the final element
8759      * before looping through the remainder.  (Thus we test the length, final,
8760      * and first elements right off the bat) */
8761     if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8762         retval = FALSE;
8763     }
8764     else for (i = 0; i < len_a - 1; i++) {
8765         if (array_a[i] != array_b[i]) {
8766             retval = FALSE;
8767             break;
8768         }
8769     }
8770
8771     return retval;
8772 }
8773 #endif
8774
8775 #undef HEADER_LENGTH
8776 #undef TO_INTERNAL_SIZE
8777 #undef FROM_INTERNAL_SIZE
8778 #undef INVLIST_VERSION_ID
8779
8780 /* End of inversion list object */
8781
8782 STATIC void
8783 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
8784 {
8785     /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
8786      * constructs, and updates RExC_flags with them.  On input, RExC_parse
8787      * should point to the first flag; it is updated on output to point to the
8788      * final ')' or ':'.  There needs to be at least one flag, or this will
8789      * abort */
8790
8791     /* for (?g), (?gc), and (?o) warnings; warning
8792        about (?c) will warn about (?g) -- japhy    */
8793
8794 #define WASTED_O  0x01
8795 #define WASTED_G  0x02
8796 #define WASTED_C  0x04
8797 #define WASTED_GC (WASTED_G|WASTED_C)
8798     I32 wastedflags = 0x00;
8799     U32 posflags = 0, negflags = 0;
8800     U32 *flagsp = &posflags;
8801     char has_charset_modifier = '\0';
8802     regex_charset cs;
8803     bool has_use_defaults = FALSE;
8804     const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
8805
8806     PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
8807
8808     /* '^' as an initial flag sets certain defaults */
8809     if (UCHARAT(RExC_parse) == '^') {
8810         RExC_parse++;
8811         has_use_defaults = TRUE;
8812         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8813         set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8814                                         ? REGEX_UNICODE_CHARSET
8815                                         : REGEX_DEPENDS_CHARSET);
8816     }
8817
8818     cs = get_regex_charset(RExC_flags);
8819     if (cs == REGEX_DEPENDS_CHARSET
8820         && (RExC_utf8 || RExC_uni_semantics))
8821     {
8822         cs = REGEX_UNICODE_CHARSET;
8823     }
8824
8825     while (*RExC_parse) {
8826         /* && strchr("iogcmsx", *RExC_parse) */
8827         /* (?g), (?gc) and (?o) are useless here
8828            and must be globally applied -- japhy */
8829         switch (*RExC_parse) {
8830
8831             /* Code for the imsx flags */
8832             CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8833
8834             case LOCALE_PAT_MOD:
8835                 if (has_charset_modifier) {
8836                     goto excess_modifier;
8837                 }
8838                 else if (flagsp == &negflags) {
8839                     goto neg_modifier;
8840                 }
8841                 cs = REGEX_LOCALE_CHARSET;
8842                 has_charset_modifier = LOCALE_PAT_MOD;
8843                 RExC_contains_locale = 1;
8844                 break;
8845             case UNICODE_PAT_MOD:
8846                 if (has_charset_modifier) {
8847                     goto excess_modifier;
8848                 }
8849                 else if (flagsp == &negflags) {
8850                     goto neg_modifier;
8851                 }
8852                 cs = REGEX_UNICODE_CHARSET;
8853                 has_charset_modifier = UNICODE_PAT_MOD;
8854                 break;
8855             case ASCII_RESTRICT_PAT_MOD:
8856                 if (flagsp == &negflags) {
8857                     goto neg_modifier;
8858                 }
8859                 if (has_charset_modifier) {
8860                     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8861                         goto excess_modifier;
8862                     }
8863                     /* Doubled modifier implies more restricted */
8864                     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8865                 }
8866                 else {
8867                     cs = REGEX_ASCII_RESTRICTED_CHARSET;
8868                 }
8869                 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8870                 break;
8871             case DEPENDS_PAT_MOD:
8872                 if (has_use_defaults) {
8873                     goto fail_modifiers;
8874                 }
8875                 else if (flagsp == &negflags) {
8876                     goto neg_modifier;
8877                 }
8878                 else if (has_charset_modifier) {
8879                     goto excess_modifier;
8880                 }
8881
8882                 /* The dual charset means unicode semantics if the
8883                  * pattern (or target, not known until runtime) are
8884                  * utf8, or something in the pattern indicates unicode
8885                  * semantics */
8886                 cs = (RExC_utf8 || RExC_uni_semantics)
8887                      ? REGEX_UNICODE_CHARSET
8888                      : REGEX_DEPENDS_CHARSET;
8889                 has_charset_modifier = DEPENDS_PAT_MOD;
8890                 break;
8891             excess_modifier:
8892                 RExC_parse++;
8893                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
8894                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
8895                 }
8896                 else if (has_charset_modifier == *(RExC_parse - 1)) {
8897                     vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
8898                 }
8899                 else {
8900                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
8901                 }
8902                 /*NOTREACHED*/
8903             neg_modifier:
8904                 RExC_parse++;
8905                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8906                 /*NOTREACHED*/
8907             case ONCE_PAT_MOD: /* 'o' */
8908             case GLOBAL_PAT_MOD: /* 'g' */
8909                 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8910                     const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
8911                     if (! (wastedflags & wflagbit) ) {
8912                         wastedflags |= wflagbit;
8913                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
8914                         vWARN5(
8915                             RExC_parse + 1,
8916                             "Useless (%s%c) - %suse /%c modifier",
8917                             flagsp == &negflags ? "?-" : "?",
8918                             *RExC_parse,
8919                             flagsp == &negflags ? "don't " : "",
8920                             *RExC_parse
8921                         );
8922                     }
8923                 }
8924                 break;
8925
8926             case CONTINUE_PAT_MOD: /* 'c' */
8927                 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8928                     if (! (wastedflags & WASTED_C) ) {
8929                         wastedflags |= WASTED_GC;
8930                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
8931                         vWARN3(
8932                             RExC_parse + 1,
8933                             "Useless (%sc) - %suse /gc modifier",
8934                             flagsp == &negflags ? "?-" : "?",
8935                             flagsp == &negflags ? "don't " : ""
8936                         );
8937                     }
8938                 }
8939                 break;
8940             case KEEPCOPY_PAT_MOD: /* 'p' */
8941                 if (flagsp == &negflags) {
8942                     if (SIZE_ONLY)
8943                         ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
8944                 } else {
8945                     *flagsp |= RXf_PMf_KEEPCOPY;
8946                 }
8947                 break;
8948             case '-':
8949                 /* A flag is a default iff it is following a minus, so
8950                  * if there is a minus, it means will be trying to
8951                  * re-specify a default which is an error */
8952                 if (has_use_defaults || flagsp == &negflags) {
8953                     goto fail_modifiers;
8954                 }
8955                 flagsp = &negflags;
8956                 wastedflags = 0;  /* reset so (?g-c) warns twice */
8957                 break;
8958             case ':':
8959             case ')':
8960                 RExC_flags |= posflags;
8961                 RExC_flags &= ~negflags;
8962                 set_regex_charset(&RExC_flags, cs);
8963                 if (RExC_flags & RXf_PMf_FOLD) {
8964                     RExC_contains_i = 1;
8965                 }
8966                 return;
8967                 /*NOTREACHED*/
8968             default:
8969             fail_modifiers:
8970                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
8971                 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
8972                       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
8973                 /*NOTREACHED*/
8974         }
8975
8976         ++RExC_parse;
8977     }
8978 }
8979
8980 /*
8981  - reg - regular expression, i.e. main body or parenthesized thing
8982  *
8983  * Caller must absorb opening parenthesis.
8984  *
8985  * Combining parenthesis handling with the base level of regular expression
8986  * is a trifle forced, but the need to tie the tails of the branches to what
8987  * follows makes it hard to avoid.
8988  */
8989 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8990 #ifdef DEBUGGING
8991 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8992 #else
8993 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8994 #endif
8995
8996 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
8997    flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
8998    needs to be restarted.
8999    Otherwise would only return NULL if regbranch() returns NULL, which
9000    cannot happen.  */
9001 STATIC regnode *
9002 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9003     /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9004      * 2 is like 1, but indicates that nextchar() has been called to advance
9005      * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
9006      * this flag alerts us to the need to check for that */
9007 {
9008     dVAR;
9009     regnode *ret;               /* Will be the head of the group. */
9010     regnode *br;
9011     regnode *lastbr;
9012     regnode *ender = NULL;
9013     I32 parno = 0;
9014     I32 flags;
9015     U32 oregflags = RExC_flags;
9016     bool have_branch = 0;
9017     bool is_open = 0;
9018     I32 freeze_paren = 0;
9019     I32 after_freeze = 0;
9020
9021     char * parse_start = RExC_parse; /* MJD */
9022     char * const oregcomp_parse = RExC_parse;
9023
9024     GET_RE_DEBUG_FLAGS_DECL;
9025
9026     PERL_ARGS_ASSERT_REG;
9027     DEBUG_PARSE("reg ");
9028
9029     *flagp = 0;                         /* Tentatively. */
9030
9031
9032     /* Make an OPEN node, if parenthesized. */
9033     if (paren) {
9034
9035         /* Under /x, space and comments can be gobbled up between the '(' and
9036          * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
9037          * intervening space, as the sequence is a token, and a token should be
9038          * indivisible */
9039         bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9040
9041         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9042             char *start_verb = RExC_parse;
9043             STRLEN verb_len = 0;
9044             char *start_arg = NULL;
9045             unsigned char op = 0;
9046             int argok = 1;
9047             int internal_argval = 0; /* internal_argval is only useful if !argok */
9048
9049             if (has_intervening_patws && SIZE_ONLY) {
9050                 ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated");
9051             }
9052             while ( *RExC_parse && *RExC_parse != ')' ) {
9053                 if ( *RExC_parse == ':' ) {
9054                     start_arg = RExC_parse + 1;
9055                     break;
9056                 }
9057                 RExC_parse++;
9058             }
9059             ++start_verb;
9060             verb_len = RExC_parse - start_verb;
9061             if ( start_arg ) {
9062                 RExC_parse++;
9063                 while ( *RExC_parse && *RExC_parse != ')' )
9064                     RExC_parse++;
9065                 if ( *RExC_parse != ')' )
9066                     vFAIL("Unterminated verb pattern argument");
9067                 if ( RExC_parse == start_arg )
9068                     start_arg = NULL;
9069             } else {
9070                 if ( *RExC_parse != ')' )
9071                     vFAIL("Unterminated verb pattern");
9072             }
9073
9074             switch ( *start_verb ) {
9075             case 'A':  /* (*ACCEPT) */
9076                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9077                     op = ACCEPT;
9078                     internal_argval = RExC_nestroot;
9079                 }
9080                 break;
9081             case 'C':  /* (*COMMIT) */
9082                 if ( memEQs(start_verb,verb_len,"COMMIT") )
9083                     op = COMMIT;
9084                 break;
9085             case 'F':  /* (*FAIL) */
9086                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9087                     op = OPFAIL;
9088                     argok = 0;
9089                 }
9090                 break;
9091             case ':':  /* (*:NAME) */
9092             case 'M':  /* (*MARK:NAME) */
9093                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9094                     op = MARKPOINT;
9095                     argok = -1;
9096                 }
9097                 break;
9098             case 'P':  /* (*PRUNE) */
9099                 if ( memEQs(start_verb,verb_len,"PRUNE") )
9100                     op = PRUNE;
9101                 break;
9102             case 'S':   /* (*SKIP) */
9103                 if ( memEQs(start_verb,verb_len,"SKIP") )
9104                     op = SKIP;
9105                 break;
9106             case 'T':  /* (*THEN) */
9107                 /* [19:06] <TimToady> :: is then */
9108                 if ( memEQs(start_verb,verb_len,"THEN") ) {
9109                     op = CUTGROUP;
9110                     RExC_seen |= REG_SEEN_CUTGROUP;
9111                 }
9112                 break;
9113             }
9114             if ( ! op ) {
9115                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9116                 vFAIL2utf8f(
9117                     "Unknown verb pattern '%"UTF8f"'",
9118                     UTF8fARG(UTF, verb_len, start_verb));
9119             }
9120             if ( argok ) {
9121                 if ( start_arg && internal_argval ) {
9122                     vFAIL3("Verb pattern '%.*s' may not have an argument",
9123                         verb_len, start_verb);
9124                 } else if ( argok < 0 && !start_arg ) {
9125                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9126                         verb_len, start_verb);
9127                 } else {
9128                     ret = reganode(pRExC_state, op, internal_argval);
9129                     if ( ! internal_argval && ! SIZE_ONLY ) {
9130                         if (start_arg) {
9131                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
9132                             ARG(ret) = add_data( pRExC_state, STR_WITH_LEN("S"));
9133                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9134                             ret->flags = 0;
9135                         } else {
9136                             ret->flags = 1;
9137                         }
9138                     }
9139                 }
9140                 if (!internal_argval)
9141                     RExC_seen |= REG_SEEN_VERBARG;
9142             } else if ( start_arg ) {
9143                 vFAIL3("Verb pattern '%.*s' may not have an argument",
9144                         verb_len, start_verb);
9145             } else {
9146                 ret = reg_node(pRExC_state, op);
9147             }
9148             nextchar(pRExC_state);
9149             return ret;
9150         }
9151         else if (*RExC_parse == '?') { /* (?...) */
9152             bool is_logical = 0;
9153             const char * const seqstart = RExC_parse;
9154             if (has_intervening_patws && SIZE_ONLY) {
9155                 ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated");
9156             }
9157
9158             RExC_parse++;
9159             paren = *RExC_parse++;
9160             ret = NULL;                 /* For look-ahead/behind. */
9161             switch (paren) {
9162
9163             case 'P':   /* (?P...) variants for those used to PCRE/Python */
9164                 paren = *RExC_parse++;
9165                 if ( paren == '<')         /* (?P<...>) named capture */
9166                     goto named_capture;
9167                 else if (paren == '>') {   /* (?P>name) named recursion */
9168                     goto named_recursion;
9169                 }
9170                 else if (paren == '=') {   /* (?P=...)  named backref */
9171                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
9172                        you change this make sure you change that */
9173                     char* name_start = RExC_parse;
9174                     U32 num = 0;
9175                     SV *sv_dat = reg_scan_name(pRExC_state,
9176                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9177                     if (RExC_parse == name_start || *RExC_parse != ')')
9178                         vFAIL2("Sequence %.3s... not terminated",parse_start);
9179
9180                     if (!SIZE_ONLY) {
9181                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
9182                         RExC_rxi->data->data[num]=(void*)sv_dat;
9183                         SvREFCNT_inc_simple_void(sv_dat);
9184                     }
9185                     RExC_sawback = 1;
9186                     ret = reganode(pRExC_state,
9187                                    ((! FOLD)
9188                                      ? NREF
9189                                      : (ASCII_FOLD_RESTRICTED)
9190                                        ? NREFFA
9191                                        : (AT_LEAST_UNI_SEMANTICS)
9192                                          ? NREFFU
9193                                          : (LOC)
9194                                            ? NREFFL
9195                                            : NREFF),
9196                                     num);
9197                     *flagp |= HASWIDTH;
9198
9199                     Set_Node_Offset(ret, parse_start+1);
9200                     Set_Node_Cur_Length(ret, parse_start);
9201
9202                     nextchar(pRExC_state);
9203                     return ret;
9204                 }
9205                 RExC_parse++;
9206                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9207                 /*NOTREACHED*/
9208             case '<':           /* (?<...) */
9209                 if (*RExC_parse == '!')
9210                     paren = ',';
9211                 else if (*RExC_parse != '=')
9212               named_capture:
9213                 {               /* (?<...>) */
9214                     char *name_start;
9215                     SV *svname;
9216                     paren= '>';
9217             case '\'':          /* (?'...') */
9218                     name_start= RExC_parse;
9219                     svname = reg_scan_name(pRExC_state,
9220                         SIZE_ONLY ?  /* reverse test from the others */
9221                         REG_RSN_RETURN_NAME :
9222                         REG_RSN_RETURN_NULL);
9223                     if (RExC_parse == name_start) {
9224                         RExC_parse++;
9225                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9226                         /*NOTREACHED*/
9227                     }
9228                     if (*RExC_parse != paren)
9229                         vFAIL2("Sequence (?%c... not terminated",
9230                             paren=='>' ? '<' : paren);
9231                     if (SIZE_ONLY) {
9232                         HE *he_str;
9233                         SV *sv_dat = NULL;
9234                         if (!svname) /* shouldn't happen */
9235                             Perl_croak(aTHX_
9236                                 "panic: reg_scan_name returned NULL");
9237                         if (!RExC_paren_names) {
9238                             RExC_paren_names= newHV();
9239                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
9240 #ifdef DEBUGGING
9241                             RExC_paren_name_list= newAV();
9242                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
9243 #endif
9244                         }
9245                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
9246                         if ( he_str )
9247                             sv_dat = HeVAL(he_str);
9248                         if ( ! sv_dat ) {
9249                             /* croak baby croak */
9250                             Perl_croak(aTHX_
9251                                 "panic: paren_name hash element allocation failed");
9252                         } else if ( SvPOK(sv_dat) ) {
9253                             /* (?|...) can mean we have dupes so scan to check
9254                                its already been stored. Maybe a flag indicating
9255                                we are inside such a construct would be useful,
9256                                but the arrays are likely to be quite small, so
9257                                for now we punt -- dmq */
9258                             IV count = SvIV(sv_dat);
9259                             I32 *pv = (I32*)SvPVX(sv_dat);
9260                             IV i;
9261                             for ( i = 0 ; i < count ; i++ ) {
9262                                 if ( pv[i] == RExC_npar ) {
9263                                     count = 0;
9264                                     break;
9265                                 }
9266                             }
9267                             if ( count ) {
9268                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
9269                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
9270                                 pv[count] = RExC_npar;
9271                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
9272                             }
9273                         } else {
9274                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
9275                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
9276                             SvIOK_on(sv_dat);
9277                             SvIV_set(sv_dat, 1);
9278                         }
9279 #ifdef DEBUGGING
9280                         /* Yes this does cause a memory leak in debugging Perls */
9281                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
9282                             SvREFCNT_dec_NN(svname);
9283 #endif
9284
9285                         /*sv_dump(sv_dat);*/
9286                     }
9287                     nextchar(pRExC_state);
9288                     paren = 1;
9289                     goto capturing_parens;
9290                 }
9291                 RExC_seen |= REG_SEEN_LOOKBEHIND;
9292                 RExC_in_lookbehind++;
9293                 RExC_parse++;
9294             case '=':           /* (?=...) */
9295                 RExC_seen_zerolen++;
9296                 break;
9297             case '!':           /* (?!...) */
9298                 RExC_seen_zerolen++;
9299                 if (*RExC_parse == ')') {
9300                     ret=reg_node(pRExC_state, OPFAIL);
9301                     nextchar(pRExC_state);
9302                     return ret;
9303                 }
9304                 break;
9305             case '|':           /* (?|...) */
9306                 /* branch reset, behave like a (?:...) except that
9307                    buffers in alternations share the same numbers */
9308                 paren = ':';
9309                 after_freeze = freeze_paren = RExC_npar;
9310                 break;
9311             case ':':           /* (?:...) */
9312             case '>':           /* (?>...) */
9313                 break;
9314             case '$':           /* (?$...) */
9315             case '@':           /* (?@...) */
9316                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
9317                 break;
9318             case '#':           /* (?#...) */
9319                 /* XXX As soon as we disallow separating the '?' and '*' (by
9320                  * spaces or (?#...) comment), it is believed that this case
9321                  * will be unreachable and can be removed.  See
9322                  * [perl #117327] */
9323                 while (*RExC_parse && *RExC_parse != ')')
9324                     RExC_parse++;
9325                 if (*RExC_parse != ')')
9326                     FAIL("Sequence (?#... not terminated");
9327                 nextchar(pRExC_state);
9328                 *flagp = TRYAGAIN;
9329                 return NULL;
9330             case '0' :           /* (?0) */
9331             case 'R' :           /* (?R) */
9332                 if (*RExC_parse != ')')
9333                     FAIL("Sequence (?R) not terminated");
9334                 ret = reg_node(pRExC_state, GOSTART);
9335                 *flagp |= POSTPONED;
9336                 nextchar(pRExC_state);
9337                 return ret;
9338                 /*notreached*/
9339             { /* named and numeric backreferences */
9340                 I32 num;
9341             case '&':            /* (?&NAME) */
9342                 parse_start = RExC_parse - 1;
9343               named_recursion:
9344                 {
9345                     SV *sv_dat = reg_scan_name(pRExC_state,
9346                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9347                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9348                 }
9349                 goto gen_recurse_regop;
9350                 assert(0); /* NOT REACHED */
9351             case '+':
9352                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9353                     RExC_parse++;
9354                     vFAIL("Illegal pattern");
9355                 }
9356                 goto parse_recursion;
9357                 /* NOT REACHED*/
9358             case '-': /* (?-1) */
9359                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9360                     RExC_parse--; /* rewind to let it be handled later */
9361                     goto parse_flags;
9362                 }
9363                 /*FALLTHROUGH */
9364             case '1': case '2': case '3': case '4': /* (?1) */
9365             case '5': case '6': case '7': case '8': case '9':
9366                 RExC_parse--;
9367               parse_recursion:
9368                 num = atoi(RExC_parse);
9369                 parse_start = RExC_parse - 1; /* MJD */
9370                 if (*RExC_parse == '-')
9371                     RExC_parse++;
9372                 while (isDIGIT(*RExC_parse))
9373                         RExC_parse++;
9374                 if (*RExC_parse!=')')
9375                     vFAIL("Expecting close bracket");
9376
9377               gen_recurse_regop:
9378                 if ( paren == '-' ) {
9379                     /*
9380                     Diagram of capture buffer numbering.
9381                     Top line is the normal capture buffer numbers
9382                     Bottom line is the negative indexing as from
9383                     the X (the (?-2))
9384
9385                     +   1 2    3 4 5 X          6 7
9386                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
9387                     -   5 4    3 2 1 X          x x
9388
9389                     */
9390                     num = RExC_npar + num;
9391                     if (num < 1)  {
9392                         RExC_parse++;
9393                         vFAIL("Reference to nonexistent group");
9394                     }
9395                 } else if ( paren == '+' ) {
9396                     num = RExC_npar + num - 1;
9397                 }
9398
9399                 ret = reganode(pRExC_state, GOSUB, num);
9400                 if (!SIZE_ONLY) {
9401                     if (num > (I32)RExC_rx->nparens) {
9402                         RExC_parse++;
9403                         vFAIL("Reference to nonexistent group");
9404                     }
9405                     ARG2L_SET( ret, RExC_recurse_count++);
9406                     RExC_emit++;
9407                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9408                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
9409                 } else {
9410                     RExC_size++;
9411                 }
9412                 RExC_seen |= REG_SEEN_RECURSE;
9413                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
9414                 Set_Node_Offset(ret, parse_start); /* MJD */
9415
9416                 *flagp |= POSTPONED;
9417                 nextchar(pRExC_state);
9418                 return ret;
9419             } /* named and numeric backreferences */
9420             assert(0); /* NOT REACHED */
9421
9422             case '?':           /* (??...) */
9423                 is_logical = 1;
9424                 if (*RExC_parse != '{') {
9425                     RExC_parse++;
9426                     vFAIL2utf8f(
9427                         "Sequence (%"UTF8f"...) not recognized",
9428                         UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9429                     /*NOTREACHED*/
9430                 }
9431                 *flagp |= POSTPONED;
9432                 paren = *RExC_parse++;
9433                 /* FALL THROUGH */
9434             case '{':           /* (?{...}) */
9435             {
9436                 U32 n = 0;
9437                 struct reg_code_block *cb;
9438
9439                 RExC_seen_zerolen++;
9440
9441                 if (   !pRExC_state->num_code_blocks
9442                     || pRExC_state->code_index >= pRExC_state->num_code_blocks
9443                     || pRExC_state->code_blocks[pRExC_state->code_index].start
9444                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
9445                             - RExC_start)
9446                 ) {
9447                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
9448                         FAIL("panic: Sequence (?{...}): no code block found\n");
9449                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
9450                 }
9451                 /* this is a pre-compiled code block (?{...}) */
9452                 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
9453                 RExC_parse = RExC_start + cb->end;
9454                 if (!SIZE_ONLY) {
9455                     OP *o = cb->block;
9456                     if (cb->src_regex) {
9457                         n = add_data(pRExC_state, STR_WITH_LEN("rl"));
9458                         RExC_rxi->data->data[n] =
9459                             (void*)SvREFCNT_inc((SV*)cb->src_regex);
9460                         RExC_rxi->data->data[n+1] = (void*)o;
9461                     }
9462                     else {
9463                         n = add_data(pRExC_state,
9464                                (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
9465                         RExC_rxi->data->data[n] = (void*)o;
9466                     }
9467                 }
9468                 pRExC_state->code_index++;
9469                 nextchar(pRExC_state);
9470
9471                 if (is_logical) {
9472                     regnode *eval;
9473                     ret = reg_node(pRExC_state, LOGICAL);
9474                     eval = reganode(pRExC_state, EVAL, n);
9475                     if (!SIZE_ONLY) {
9476                         ret->flags = 2;
9477                         /* for later propagation into (??{}) return value */
9478                         eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
9479                     }
9480                     REGTAIL(pRExC_state, ret, eval);
9481                     /* deal with the length of this later - MJD */
9482                     return ret;
9483                 }
9484                 ret = reganode(pRExC_state, EVAL, n);
9485                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
9486                 Set_Node_Offset(ret, parse_start);
9487                 return ret;
9488             }
9489             case '(':           /* (?(?{...})...) and (?(?=...)...) */
9490             {
9491                 int is_define= 0;
9492                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
9493                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9494                         || RExC_parse[1] == '<'
9495                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
9496                         I32 flag;
9497                         regnode *tail;
9498
9499                         ret = reg_node(pRExC_state, LOGICAL);
9500                         if (!SIZE_ONLY)
9501                             ret->flags = 1;
9502
9503                         tail = reg(pRExC_state, 1, &flag, depth+1);
9504                         if (flag & RESTART_UTF8) {
9505                             *flagp = RESTART_UTF8;
9506                             return NULL;
9507                         }
9508                         REGTAIL(pRExC_state, ret, tail);
9509                         goto insert_if;
9510                     }
9511                 }
9512                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
9513                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9514                 {
9515                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
9516                     char *name_start= RExC_parse++;
9517                     U32 num = 0;
9518                     SV *sv_dat=reg_scan_name(pRExC_state,
9519                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9520                     if (RExC_parse == name_start || *RExC_parse != ch)
9521                         vFAIL2("Sequence (?(%c... not terminated",
9522                             (ch == '>' ? '<' : ch));
9523                     RExC_parse++;
9524                     if (!SIZE_ONLY) {
9525                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
9526                         RExC_rxi->data->data[num]=(void*)sv_dat;
9527                         SvREFCNT_inc_simple_void(sv_dat);
9528                     }
9529                     ret = reganode(pRExC_state,NGROUPP,num);
9530                     goto insert_if_check_paren;
9531                 }
9532                 else if (RExC_parse[0] == 'D' &&
9533                          RExC_parse[1] == 'E' &&
9534                          RExC_parse[2] == 'F' &&
9535                          RExC_parse[3] == 'I' &&
9536                          RExC_parse[4] == 'N' &&
9537                          RExC_parse[5] == 'E')
9538                 {
9539                     ret = reganode(pRExC_state,DEFINEP,0);
9540                     RExC_parse +=6 ;
9541                     is_define = 1;
9542                     goto insert_if_check_paren;
9543                 }
9544                 else if (RExC_parse[0] == 'R') {
9545                     RExC_parse++;
9546                     parno = 0;
9547                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9548                         parno = atoi(RExC_parse++);
9549                         while (isDIGIT(*RExC_parse))
9550                             RExC_parse++;
9551                     } else if (RExC_parse[0] == '&') {
9552                         SV *sv_dat;
9553                         RExC_parse++;
9554                         sv_dat = reg_scan_name(pRExC_state,
9555                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9556                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9557                     }
9558                     ret = reganode(pRExC_state,INSUBP,parno);
9559                     goto insert_if_check_paren;
9560                 }
9561                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9562                     /* (?(1)...) */
9563                     char c;
9564                     char *tmp;
9565                     parno = atoi(RExC_parse++);
9566
9567                     while (isDIGIT(*RExC_parse))
9568                         RExC_parse++;
9569                     ret = reganode(pRExC_state, GROUPP, parno);
9570
9571                  insert_if_check_paren:
9572                     if (*(tmp = nextchar(pRExC_state)) != ')') {
9573                         if ( UTF ) {
9574                         /* Like the name implies, nextchar deals in chars,
9575                          * not characters, so if under UTF, undo its work
9576                          * and skip over the the next character.
9577                          */
9578                             RExC_parse = tmp;
9579                             RExC_parse += UTF8SKIP(RExC_parse);
9580                         }
9581                         vFAIL("Switch condition not recognized");
9582                     }
9583                   insert_if:
9584                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
9585                     br = regbranch(pRExC_state, &flags, 1,depth+1);
9586                     if (br == NULL) {
9587                         if (flags & RESTART_UTF8) {
9588                             *flagp = RESTART_UTF8;
9589                             return NULL;
9590                         }
9591                         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9592                               (UV) flags);
9593                     } else
9594                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
9595                     c = *nextchar(pRExC_state);
9596                     if (flags&HASWIDTH)
9597                         *flagp |= HASWIDTH;
9598                     if (c == '|') {
9599                         if (is_define)
9600                             vFAIL("(?(DEFINE)....) does not allow branches");
9601                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
9602                         if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
9603                             if (flags & RESTART_UTF8) {
9604                                 *flagp = RESTART_UTF8;
9605                                 return NULL;
9606                             }
9607                             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9608                                   (UV) flags);
9609                         }
9610                         REGTAIL(pRExC_state, ret, lastbr);
9611                         if (flags&HASWIDTH)
9612                             *flagp |= HASWIDTH;
9613                         c = *nextchar(pRExC_state);
9614                     }
9615                     else
9616                         lastbr = NULL;
9617                     if (c != ')')
9618                         vFAIL("Switch (?(condition)... contains too many branches");
9619                     ender = reg_node(pRExC_state, TAIL);
9620                     REGTAIL(pRExC_state, br, ender);
9621                     if (lastbr) {
9622                         REGTAIL(pRExC_state, lastbr, ender);
9623                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
9624                     }
9625                     else
9626                         REGTAIL(pRExC_state, ret, ender);
9627                     RExC_size++; /* XXX WHY do we need this?!!
9628                                     For large programs it seems to be required
9629                                     but I can't figure out why. -- dmq*/
9630                     return ret;
9631                 }
9632                 else {
9633                     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9634                     vFAIL("Unknown switch condition (?(...))");
9635                 }
9636             }
9637             case '[':           /* (?[ ... ]) */
9638                 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
9639                                          oregcomp_parse);
9640             case 0:
9641                 RExC_parse--; /* for vFAIL to print correctly */
9642                 vFAIL("Sequence (? incomplete");
9643                 break;
9644             default: /* e.g., (?i) */
9645                 --RExC_parse;
9646               parse_flags:
9647                 parse_lparen_question_flags(pRExC_state);
9648                 if (UCHARAT(RExC_parse) != ':') {
9649                     nextchar(pRExC_state);
9650                     *flagp = TRYAGAIN;
9651                     return NULL;
9652                 }
9653                 paren = ':';
9654                 nextchar(pRExC_state);
9655                 ret = NULL;
9656                 goto parse_rest;
9657             } /* end switch */
9658         }
9659         else {                  /* (...) */
9660           capturing_parens:
9661             parno = RExC_npar;
9662             RExC_npar++;
9663
9664             ret = reganode(pRExC_state, OPEN, parno);
9665             if (!SIZE_ONLY ){
9666                 if (!RExC_nestroot)
9667                     RExC_nestroot = parno;
9668                 if (RExC_seen & REG_SEEN_RECURSE
9669                     && !RExC_open_parens[parno-1])
9670                 {
9671                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9672                         "Setting open paren #%"IVdf" to %d\n",
9673                         (IV)parno, REG_NODE_NUM(ret)));
9674                     RExC_open_parens[parno-1]= ret;
9675                 }
9676             }
9677             Set_Node_Length(ret, 1); /* MJD */
9678             Set_Node_Offset(ret, RExC_parse); /* MJD */
9679             is_open = 1;
9680         }
9681     }
9682     else                        /* ! paren */
9683         ret = NULL;
9684
9685    parse_rest:
9686     /* Pick up the branches, linking them together. */
9687     parse_start = RExC_parse;   /* MJD */
9688     br = regbranch(pRExC_state, &flags, 1,depth+1);
9689
9690     /*     branch_len = (paren != 0); */
9691
9692     if (br == NULL) {
9693         if (flags & RESTART_UTF8) {
9694             *flagp = RESTART_UTF8;
9695             return NULL;
9696         }
9697         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
9698     }
9699     if (*RExC_parse == '|') {
9700         if (!SIZE_ONLY && RExC_extralen) {
9701             reginsert(pRExC_state, BRANCHJ, br, depth+1);
9702         }
9703         else {                  /* MJD */
9704             reginsert(pRExC_state, BRANCH, br, depth+1);
9705             Set_Node_Length(br, paren != 0);
9706             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9707         }
9708         have_branch = 1;
9709         if (SIZE_ONLY)
9710             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
9711     }
9712     else if (paren == ':') {
9713         *flagp |= flags&SIMPLE;
9714     }
9715     if (is_open) {                              /* Starts with OPEN. */
9716         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
9717     }
9718     else if (paren != '?')              /* Not Conditional */
9719         ret = br;
9720     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9721     lastbr = br;
9722     while (*RExC_parse == '|') {
9723         if (!SIZE_ONLY && RExC_extralen) {
9724             ender = reganode(pRExC_state, LONGJMP,0);
9725             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9726         }
9727         if (SIZE_ONLY)
9728             RExC_extralen += 2;         /* Account for LONGJMP. */
9729         nextchar(pRExC_state);
9730         if (freeze_paren) {
9731             if (RExC_npar > after_freeze)
9732                 after_freeze = RExC_npar;
9733             RExC_npar = freeze_paren;
9734         }
9735         br = regbranch(pRExC_state, &flags, 0, depth+1);
9736
9737         if (br == NULL) {
9738             if (flags & RESTART_UTF8) {
9739                 *flagp = RESTART_UTF8;
9740                 return NULL;
9741             }
9742             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
9743         }
9744         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
9745         lastbr = br;
9746         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9747     }
9748
9749     if (have_branch || paren != ':') {
9750         /* Make a closing node, and hook it on the end. */
9751         switch (paren) {
9752         case ':':
9753             ender = reg_node(pRExC_state, TAIL);
9754             break;
9755         case 1: case 2:
9756             ender = reganode(pRExC_state, CLOSE, parno);
9757             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9758                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9759                         "Setting close paren #%"IVdf" to %d\n",
9760                         (IV)parno, REG_NODE_NUM(ender)));
9761                 RExC_close_parens[parno-1]= ender;
9762                 if (RExC_nestroot == parno)
9763                     RExC_nestroot = 0;
9764             }
9765             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9766             Set_Node_Length(ender,1); /* MJD */
9767             break;
9768         case '<':
9769         case ',':
9770         case '=':
9771         case '!':
9772             *flagp &= ~HASWIDTH;
9773             /* FALL THROUGH */
9774         case '>':
9775             ender = reg_node(pRExC_state, SUCCEED);
9776             break;
9777         case 0:
9778             ender = reg_node(pRExC_state, END);
9779             if (!SIZE_ONLY) {
9780                 assert(!RExC_opend); /* there can only be one! */
9781                 RExC_opend = ender;
9782             }
9783             break;
9784         }
9785         DEBUG_PARSE_r(if (!SIZE_ONLY) {
9786             SV * const mysv_val1=sv_newmortal();
9787             SV * const mysv_val2=sv_newmortal();
9788             DEBUG_PARSE_MSG("lsbr");
9789             regprop(RExC_rx, mysv_val1, lastbr);
9790             regprop(RExC_rx, mysv_val2, ender);
9791             PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9792                           SvPV_nolen_const(mysv_val1),
9793                           (IV)REG_NODE_NUM(lastbr),
9794                           SvPV_nolen_const(mysv_val2),
9795                           (IV)REG_NODE_NUM(ender),
9796                           (IV)(ender - lastbr)
9797             );
9798         });
9799         REGTAIL(pRExC_state, lastbr, ender);
9800
9801         if (have_branch && !SIZE_ONLY) {
9802             char is_nothing= 1;
9803             if (depth==1)
9804                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9805
9806             /* Hook the tails of the branches to the closing node. */
9807             for (br = ret; br; br = regnext(br)) {
9808                 const U8 op = PL_regkind[OP(br)];
9809                 if (op == BRANCH) {
9810                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9811                     if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9812                         is_nothing= 0;
9813                 }
9814                 else if (op == BRANCHJ) {
9815                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9816                     /* for now we always disable this optimisation * /
9817                     if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9818                     */
9819                         is_nothing= 0;
9820                 }
9821             }
9822             if (is_nothing) {
9823                 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9824                 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9825                     SV * const mysv_val1=sv_newmortal();
9826                     SV * const mysv_val2=sv_newmortal();
9827                     DEBUG_PARSE_MSG("NADA");
9828                     regprop(RExC_rx, mysv_val1, ret);
9829                     regprop(RExC_rx, mysv_val2, ender);
9830                     PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9831                                   SvPV_nolen_const(mysv_val1),
9832                                   (IV)REG_NODE_NUM(ret),
9833                                   SvPV_nolen_const(mysv_val2),
9834                                   (IV)REG_NODE_NUM(ender),
9835                                   (IV)(ender - ret)
9836                     );
9837                 });
9838                 OP(br)= NOTHING;
9839                 if (OP(ender) == TAIL) {
9840                     NEXT_OFF(br)= 0;
9841                     RExC_emit= br + 1;
9842                 } else {
9843                     regnode *opt;
9844                     for ( opt= br + 1; opt < ender ; opt++ )
9845                         OP(opt)= OPTIMIZED;
9846                     NEXT_OFF(br)= ender - br;
9847                 }
9848             }
9849         }
9850     }
9851
9852     {
9853         const char *p;
9854         static const char parens[] = "=!<,>";
9855
9856         if (paren && (p = strchr(parens, paren))) {
9857             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9858             int flag = (p - parens) > 1;
9859
9860             if (paren == '>')
9861                 node = SUSPEND, flag = 0;
9862             reginsert(pRExC_state, node,ret, depth+1);
9863             Set_Node_Cur_Length(ret, parse_start);
9864             Set_Node_Offset(ret, parse_start + 1);
9865             ret->flags = flag;
9866             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9867         }
9868     }
9869
9870     /* Check for proper termination. */
9871     if (paren) {
9872         /* restore original flags, but keep (?p) */
9873         RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
9874         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9875             RExC_parse = oregcomp_parse;
9876             vFAIL("Unmatched (");
9877         }
9878     }
9879     else if (!paren && RExC_parse < RExC_end) {
9880         if (*RExC_parse == ')') {
9881             RExC_parse++;
9882             vFAIL("Unmatched )");
9883         }
9884         else
9885             FAIL("Junk on end of regexp");      /* "Can't happen". */
9886         assert(0); /* NOTREACHED */
9887     }
9888
9889     if (RExC_in_lookbehind) {
9890         RExC_in_lookbehind--;
9891     }
9892     if (after_freeze > RExC_npar)
9893         RExC_npar = after_freeze;
9894     return(ret);
9895 }
9896
9897 /*
9898  - regbranch - one alternative of an | operator
9899  *
9900  * Implements the concatenation operator.
9901  *
9902  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
9903  * restarted.
9904  */
9905 STATIC regnode *
9906 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9907 {
9908     dVAR;
9909     regnode *ret;
9910     regnode *chain = NULL;
9911     regnode *latest;
9912     I32 flags = 0, c = 0;
9913     GET_RE_DEBUG_FLAGS_DECL;
9914
9915     PERL_ARGS_ASSERT_REGBRANCH;
9916
9917     DEBUG_PARSE("brnc");
9918
9919     if (first)
9920         ret = NULL;
9921     else {
9922         if (!SIZE_ONLY && RExC_extralen)
9923             ret = reganode(pRExC_state, BRANCHJ,0);
9924         else {
9925             ret = reg_node(pRExC_state, BRANCH);
9926             Set_Node_Length(ret, 1);
9927         }
9928     }
9929
9930     if (!first && SIZE_ONLY)
9931         RExC_extralen += 1;                     /* BRANCHJ */
9932
9933     *flagp = WORST;                     /* Tentatively. */
9934
9935     RExC_parse--;
9936     nextchar(pRExC_state);
9937     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9938         flags &= ~TRYAGAIN;
9939         latest = regpiece(pRExC_state, &flags,depth+1);
9940         if (latest == NULL) {
9941             if (flags & TRYAGAIN)
9942                 continue;
9943             if (flags & RESTART_UTF8) {
9944                 *flagp = RESTART_UTF8;
9945                 return NULL;
9946             }
9947             FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
9948         }
9949         else if (ret == NULL)
9950             ret = latest;
9951         *flagp |= flags&(HASWIDTH|POSTPONED);
9952         if (chain == NULL)      /* First piece. */
9953             *flagp |= flags&SPSTART;
9954         else {
9955             RExC_naughty++;
9956             REGTAIL(pRExC_state, chain, latest);
9957         }
9958         chain = latest;
9959         c++;
9960     }
9961     if (chain == NULL) {        /* Loop ran zero times. */
9962         chain = reg_node(pRExC_state, NOTHING);
9963         if (ret == NULL)
9964             ret = chain;
9965     }
9966     if (c == 1) {
9967         *flagp |= flags&SIMPLE;
9968     }
9969
9970     return ret;
9971 }
9972
9973 /*
9974  - regpiece - something followed by possible [*+?]
9975  *
9976  * Note that the branching code sequences used for ? and the general cases
9977  * of * and + are somewhat optimized:  they use the same NOTHING node as
9978  * both the endmarker for their branch list and the body of the last branch.
9979  * It might seem that this node could be dispensed with entirely, but the
9980  * endmarker role is not redundant.
9981  *
9982  * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
9983  * TRYAGAIN.
9984  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
9985  * restarted.
9986  */
9987 STATIC regnode *
9988 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9989 {
9990     dVAR;
9991     regnode *ret;
9992     char op;
9993     char *next;
9994     I32 flags;
9995     const char * const origparse = RExC_parse;
9996     I32 min;
9997     I32 max = REG_INFTY;
9998 #ifdef RE_TRACK_PATTERN_OFFSETS
9999     char *parse_start;
10000 #endif
10001     const char *maxpos = NULL;
10002
10003     /* Save the original in case we change the emitted regop to a FAIL. */
10004     regnode * const orig_emit = RExC_emit;
10005
10006     GET_RE_DEBUG_FLAGS_DECL;
10007
10008     PERL_ARGS_ASSERT_REGPIECE;
10009
10010     DEBUG_PARSE("piec");
10011
10012     ret = regatom(pRExC_state, &flags,depth+1);
10013     if (ret == NULL) {
10014         if (flags & (TRYAGAIN|RESTART_UTF8))
10015             *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10016         else
10017             FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10018         return(NULL);
10019     }
10020
10021     op = *RExC_parse;
10022
10023     if (op == '{' && regcurly(RExC_parse, FALSE)) {
10024         maxpos = NULL;
10025 #ifdef RE_TRACK_PATTERN_OFFSETS
10026         parse_start = RExC_parse; /* MJD */
10027 #endif
10028         next = RExC_parse + 1;
10029         while (isDIGIT(*next) || *next == ',') {
10030             if (*next == ',') {
10031                 if (maxpos)
10032                     break;
10033                 else
10034                     maxpos = next;
10035             }
10036             next++;
10037         }
10038         if (*next == '}') {             /* got one */
10039             if (!maxpos)
10040                 maxpos = next;
10041             RExC_parse++;
10042             min = atoi(RExC_parse);
10043             if (*maxpos == ',')
10044                 maxpos++;
10045             else
10046                 maxpos = RExC_parse;
10047             max = atoi(maxpos);
10048             if (!max && *maxpos != '0')
10049                 max = REG_INFTY;                /* meaning "infinity" */
10050             else if (max >= REG_INFTY)
10051                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10052             RExC_parse = next;
10053             nextchar(pRExC_state);
10054             if (max < min) {    /* If can't match, warn and optimize to fail
10055                                    unconditionally */
10056                 if (SIZE_ONLY) {
10057                     ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10058
10059                     /* We can't back off the size because we have to reserve
10060                      * enough space for all the things we are about to throw
10061                      * away, but we can shrink it by the ammount we are about
10062                      * to re-use here */
10063                     RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10064                 }
10065                 else {
10066                     RExC_emit = orig_emit;
10067                 }
10068                 ret = reg_node(pRExC_state, OPFAIL);
10069                 return ret;
10070             }
10071
10072         do_curly:
10073             if ((flags&SIMPLE)) {
10074                 RExC_naughty += 2 + RExC_naughty / 2;
10075                 reginsert(pRExC_state, CURLY, ret, depth+1);
10076                 Set_Node_Offset(ret, parse_start+1); /* MJD */
10077                 Set_Node_Cur_Length(ret, parse_start);
10078             }
10079             else {
10080                 regnode * const w = reg_node(pRExC_state, WHILEM);
10081
10082                 w->flags = 0;
10083                 REGTAIL(pRExC_state, ret, w);
10084                 if (!SIZE_ONLY && RExC_extralen) {
10085                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
10086                     reginsert(pRExC_state, NOTHING,ret, depth+1);
10087                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
10088                 }
10089                 reginsert(pRExC_state, CURLYX,ret, depth+1);
10090                                 /* MJD hk */
10091                 Set_Node_Offset(ret, parse_start+1);
10092                 Set_Node_Length(ret,
10093                                 op == '{' ? (RExC_parse - parse_start) : 1);
10094
10095                 if (!SIZE_ONLY && RExC_extralen)
10096                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
10097                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10098                 if (SIZE_ONLY)
10099                     RExC_whilem_seen++, RExC_extralen += 3;
10100                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
10101             }
10102             ret->flags = 0;
10103
10104             if (min > 0)
10105                 *flagp = WORST;
10106             if (max > 0)
10107                 *flagp |= HASWIDTH;
10108             if (!SIZE_ONLY) {
10109                 ARG1_SET(ret, (U16)min);
10110                 ARG2_SET(ret, (U16)max);
10111             }
10112
10113             goto nest_check;
10114         }
10115     }
10116
10117     if (!ISMULT1(op)) {
10118         *flagp = flags;
10119         return(ret);
10120     }
10121
10122 #if 0                           /* Now runtime fix should be reliable. */
10123
10124     /* if this is reinstated, don't forget to put this back into perldiag:
10125
10126             =item Regexp *+ operand could be empty at {#} in regex m/%s/
10127
10128            (F) The part of the regexp subject to either the * or + quantifier
10129            could match an empty string. The {#} shows in the regular
10130            expression about where the problem was discovered.
10131
10132     */
10133
10134     if (!(flags&HASWIDTH) && op != '?')
10135       vFAIL("Regexp *+ operand could be empty");
10136 #endif
10137
10138 #ifdef RE_TRACK_PATTERN_OFFSETS
10139     parse_start = RExC_parse;
10140 #endif
10141     nextchar(pRExC_state);
10142
10143     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10144
10145     if (op == '*' && (flags&SIMPLE)) {
10146         reginsert(pRExC_state, STAR, ret, depth+1);
10147         ret->flags = 0;
10148         RExC_naughty += 4;
10149     }
10150     else if (op == '*') {
10151         min = 0;
10152         goto do_curly;
10153     }
10154     else if (op == '+' && (flags&SIMPLE)) {
10155         reginsert(pRExC_state, PLUS, ret, depth+1);
10156         ret->flags = 0;
10157         RExC_naughty += 3;
10158     }
10159     else if (op == '+') {
10160         min = 1;
10161         goto do_curly;
10162     }
10163     else if (op == '?') {
10164         min = 0; max = 1;
10165         goto do_curly;
10166     }
10167   nest_check:
10168     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10169         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10170         ckWARN2reg(RExC_parse,
10171                    "%"UTF8f" matches null string many times",
10172                    UTF8fARG(UTF, (RExC_parse >= origparse ? RExC_parse - origparse : 0),
10173                    origparse));
10174         (void)ReREFCNT_inc(RExC_rx_sv);
10175     }
10176
10177     if (RExC_parse < RExC_end && *RExC_parse == '?') {
10178         nextchar(pRExC_state);
10179         reginsert(pRExC_state, MINMOD, ret, depth+1);
10180         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
10181     }
10182     else
10183     if (RExC_parse < RExC_end && *RExC_parse == '+') {
10184         regnode *ender;
10185         nextchar(pRExC_state);
10186         ender = reg_node(pRExC_state, SUCCEED);
10187         REGTAIL(pRExC_state, ret, ender);
10188         reginsert(pRExC_state, SUSPEND, ret, depth+1);
10189         ret->flags = 0;
10190         ender = reg_node(pRExC_state, TAIL);
10191         REGTAIL(pRExC_state, ret, ender);
10192     }
10193
10194     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
10195         RExC_parse++;
10196         vFAIL("Nested quantifiers");
10197     }
10198
10199     return(ret);
10200 }
10201
10202 STATIC bool
10203 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
10204         const bool strict   /* Apply stricter parsing rules? */
10205     )
10206 {
10207
10208  /* This is expected to be called by a parser routine that has recognized '\N'
10209    and needs to handle the rest. RExC_parse is expected to point at the first
10210    char following the N at the time of the call.  On successful return,
10211    RExC_parse has been updated to point to just after the sequence identified
10212    by this routine, and <*flagp> has been updated.
10213
10214    The \N may be inside (indicated by the boolean <in_char_class>) or outside a
10215    character class.
10216
10217    \N may begin either a named sequence, or if outside a character class, mean
10218    to match a non-newline.  For non single-quoted regexes, the tokenizer has
10219    attempted to decide which, and in the case of a named sequence, converted it
10220    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
10221    where c1... are the characters in the sequence.  For single-quoted regexes,
10222    the tokenizer passes the \N sequence through unchanged; this code will not
10223    attempt to determine this nor expand those, instead raising a syntax error.
10224    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
10225    or there is no '}', it signals that this \N occurrence means to match a
10226    non-newline.
10227
10228    Only the \N{U+...} form should occur in a character class, for the same
10229    reason that '.' inside a character class means to just match a period: it
10230    just doesn't make sense.
10231
10232    The function raises an error (via vFAIL), and doesn't return for various
10233    syntax errors.  Otherwise it returns TRUE and sets <node_p> or <valuep> on
10234    success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
10235    RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
10236    only possible if node_p is non-NULL.
10237
10238
10239    If <valuep> is non-null, it means the caller can accept an input sequence
10240    consisting of a just a single code point; <*valuep> is set to that value
10241    if the input is such.
10242
10243    If <node_p> is non-null it signifies that the caller can accept any other
10244    legal sequence (i.e., one that isn't just a single code point).  <*node_p>
10245    is set as follows:
10246     1) \N means not-a-NL: points to a newly created REG_ANY node;
10247     2) \N{}:              points to a new NOTHING node;
10248     3) otherwise:         points to a new EXACT node containing the resolved
10249                           string.
10250    Note that FALSE is returned for single code point sequences if <valuep> is
10251    null.
10252  */
10253
10254     char * endbrace;    /* '}' following the name */
10255     char* p;
10256     char *endchar;      /* Points to '.' or '}' ending cur char in the input
10257                            stream */
10258     bool has_multiple_chars; /* true if the input stream contains a sequence of
10259                                 more than one character */
10260
10261     GET_RE_DEBUG_FLAGS_DECL;
10262
10263     PERL_ARGS_ASSERT_GROK_BSLASH_N;
10264
10265     GET_RE_DEBUG_FLAGS;
10266
10267     assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
10268
10269     /* The [^\n] meaning of \N ignores spaces and comments under the /x
10270      * modifier.  The other meaning does not */
10271     p = (RExC_flags & RXf_PMf_EXTENDED)
10272         ? regwhite( pRExC_state, RExC_parse )
10273         : RExC_parse;
10274
10275     /* Disambiguate between \N meaning a named character versus \N meaning
10276      * [^\n].  The former is assumed when it can't be the latter. */
10277     if (*p != '{' || regcurly(p, FALSE)) {
10278         RExC_parse = p;
10279         if (! node_p) {
10280             /* no bare \N in a charclass */
10281             if (in_char_class) {
10282                 vFAIL("\\N in a character class must be a named character: \\N{...}");
10283             }
10284             return FALSE;
10285         }
10286         nextchar(pRExC_state);
10287         *node_p = reg_node(pRExC_state, REG_ANY);
10288         *flagp |= HASWIDTH|SIMPLE;
10289         RExC_naughty++;
10290         RExC_parse--;
10291         Set_Node_Length(*node_p, 1); /* MJD */
10292         return TRUE;
10293     }
10294
10295     /* Here, we have decided it should be a named character or sequence */
10296
10297     /* The test above made sure that the next real character is a '{', but
10298      * under the /x modifier, it could be separated by space (or a comment and
10299      * \n) and this is not allowed (for consistency with \x{...} and the
10300      * tokenizer handling of \N{NAME}). */
10301     if (*RExC_parse != '{') {
10302         vFAIL("Missing braces on \\N{}");
10303     }
10304
10305     RExC_parse++;       /* Skip past the '{' */
10306
10307     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
10308         || ! (endbrace == RExC_parse            /* nothing between the {} */
10309               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
10310                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
10311     {
10312         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
10313         vFAIL("\\N{NAME} must be resolved by the lexer");
10314     }
10315
10316     if (endbrace == RExC_parse) {   /* empty: \N{} */
10317         bool ret = TRUE;
10318         if (node_p) {
10319             *node_p = reg_node(pRExC_state,NOTHING);
10320         }
10321         else if (in_char_class) {
10322             if (SIZE_ONLY && in_char_class) {
10323                 if (strict) {
10324                     RExC_parse++;   /* Position after the "}" */
10325                     vFAIL("Zero length \\N{}");
10326                 }
10327                 else {
10328                     ckWARNreg(RExC_parse,
10329                               "Ignoring zero length \\N{} in character class");
10330                 }
10331             }
10332             ret = FALSE;
10333         }
10334         else {
10335             return FALSE;
10336         }
10337         nextchar(pRExC_state);
10338         return ret;
10339     }
10340
10341     RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
10342     RExC_parse += 2;    /* Skip past the 'U+' */
10343
10344     endchar = RExC_parse + strcspn(RExC_parse, ".}");
10345
10346     /* Code points are separated by dots.  If none, there is only one code
10347      * point, and is terminated by the brace */
10348     has_multiple_chars = (endchar < endbrace);
10349
10350     if (valuep && (! has_multiple_chars || in_char_class)) {
10351         /* We only pay attention to the first char of
10352         multichar strings being returned in char classes. I kinda wonder
10353         if this makes sense as it does change the behaviour
10354         from earlier versions, OTOH that behaviour was broken
10355         as well. XXX Solution is to recharacterize as
10356         [rest-of-class]|multi1|multi2... */
10357
10358         STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
10359         I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
10360             | PERL_SCAN_DISALLOW_PREFIX
10361             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
10362
10363         *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
10364
10365         /* The tokenizer should have guaranteed validity, but it's possible to
10366          * bypass it by using single quoting, so check */
10367         if (length_of_hex == 0
10368             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
10369         {
10370             RExC_parse += length_of_hex;        /* Includes all the valid */
10371             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
10372                             ? UTF8SKIP(RExC_parse)
10373                             : 1;
10374             /* Guard against malformed utf8 */
10375             if (RExC_parse >= endchar) {
10376                 RExC_parse = endchar;
10377             }
10378             vFAIL("Invalid hexadecimal number in \\N{U+...}");
10379         }
10380
10381         if (in_char_class && has_multiple_chars) {
10382             if (strict) {
10383                 RExC_parse = endbrace;
10384                 vFAIL("\\N{} in character class restricted to one character");
10385             }
10386             else {
10387                 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
10388             }
10389         }
10390
10391         RExC_parse = endbrace + 1;
10392     }
10393     else if (! node_p || ! has_multiple_chars) {
10394
10395         /* Here, the input is legal, but not according to the caller's
10396          * options.  We fail without advancing the parse, so that the
10397          * caller can try again */
10398         RExC_parse = p;
10399         return FALSE;
10400     }
10401     else {
10402
10403         /* What is done here is to convert this to a sub-pattern of the form
10404          * (?:\x{char1}\x{char2}...)
10405          * and then call reg recursively.  That way, it retains its atomicness,
10406          * while not having to worry about special handling that some code
10407          * points may have.  toke.c has converted the original Unicode values
10408          * to native, so that we can just pass on the hex values unchanged.  We
10409          * do have to set a flag to keep recoding from happening in the
10410          * recursion */
10411
10412         SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
10413         STRLEN len;
10414         char *orig_end = RExC_end;
10415         I32 flags;
10416
10417         while (RExC_parse < endbrace) {
10418
10419             /* Convert to notation the rest of the code understands */
10420             sv_catpv(substitute_parse, "\\x{");
10421             sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
10422             sv_catpv(substitute_parse, "}");
10423
10424             /* Point to the beginning of the next character in the sequence. */
10425             RExC_parse = endchar + 1;
10426             endchar = RExC_parse + strcspn(RExC_parse, ".}");
10427         }
10428         sv_catpv(substitute_parse, ")");
10429
10430         RExC_parse = SvPV(substitute_parse, len);
10431
10432         /* Don't allow empty number */
10433         if (len < 8) {
10434             vFAIL("Invalid hexadecimal number in \\N{U+...}");
10435         }
10436         RExC_end = RExC_parse + len;
10437
10438         /* The values are Unicode, and therefore not subject to recoding */
10439         RExC_override_recoding = 1;
10440
10441         if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
10442             if (flags & RESTART_UTF8) {
10443                 *flagp = RESTART_UTF8;
10444                 return FALSE;
10445             }
10446             FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
10447                   (UV) flags);
10448         }
10449         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10450
10451         RExC_parse = endbrace;
10452         RExC_end = orig_end;
10453         RExC_override_recoding = 0;
10454
10455         nextchar(pRExC_state);
10456     }
10457
10458     return TRUE;
10459 }
10460
10461
10462 /*
10463  * reg_recode
10464  *
10465  * It returns the code point in utf8 for the value in *encp.
10466  *    value: a code value in the source encoding
10467  *    encp:  a pointer to an Encode object
10468  *
10469  * If the result from Encode is not a single character,
10470  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
10471  */
10472 STATIC UV
10473 S_reg_recode(pTHX_ const char value, SV **encp)
10474 {
10475     STRLEN numlen = 1;
10476     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
10477     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
10478     const STRLEN newlen = SvCUR(sv);
10479     UV uv = UNICODE_REPLACEMENT;
10480
10481     PERL_ARGS_ASSERT_REG_RECODE;
10482
10483     if (newlen)
10484         uv = SvUTF8(sv)
10485              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
10486              : *(U8*)s;
10487
10488     if (!newlen || numlen != newlen) {
10489         uv = UNICODE_REPLACEMENT;
10490         *encp = NULL;
10491     }
10492     return uv;
10493 }
10494
10495 PERL_STATIC_INLINE U8
10496 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
10497 {
10498     U8 op;
10499
10500     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
10501
10502     if (! FOLD) {
10503         return EXACT;
10504     }
10505
10506     op = get_regex_charset(RExC_flags);
10507     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
10508         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
10509                  been, so there is no hole */
10510     }
10511
10512     return op + EXACTF;
10513 }
10514
10515 PERL_STATIC_INLINE void
10516 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
10517 {
10518     /* This knows the details about sizing an EXACTish node, setting flags for
10519      * it (by setting <*flagp>, and potentially populating it with a single
10520      * character.
10521      *
10522      * If <len> (the length in bytes) is non-zero, this function assumes that
10523      * the node has already been populated, and just does the sizing.  In this
10524      * case <code_point> should be the final code point that has already been
10525      * placed into the node.  This value will be ignored except that under some
10526      * circumstances <*flagp> is set based on it.
10527      *
10528      * If <len> is zero, the function assumes that the node is to contain only
10529      * the single character given by <code_point> and calculates what <len>
10530      * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
10531      * additionally will populate the node's STRING with <code_point>, if <len>
10532      * is 0.  In both cases <*flagp> is appropriately set
10533      *
10534      * It knows that under FOLD, the Latin Sharp S and UTF characters above
10535      * 255, must be folded (the former only when the rules indicate it can
10536      * match 'ss') */
10537
10538     bool len_passed_in = cBOOL(len != 0);
10539     U8 character[UTF8_MAXBYTES_CASE+1];
10540
10541     PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
10542
10543     if (! len_passed_in) {
10544         if (UTF) {
10545             if (FOLD && (! LOC || code_point > 255)) {
10546                 _to_uni_fold_flags(code_point,
10547                                    character,
10548                                    &len,
10549                                    FOLD_FLAGS_FULL | ((LOC)
10550                                                      ? FOLD_FLAGS_LOCALE
10551                                                      : (ASCII_FOLD_RESTRICTED)
10552                                                        ? FOLD_FLAGS_NOMIX_ASCII
10553                                                        : 0));
10554             }
10555             else {
10556                 uvchr_to_utf8( character, code_point);
10557                 len = UTF8SKIP(character);
10558             }
10559         }
10560         else if (! FOLD
10561                  || code_point != LATIN_SMALL_LETTER_SHARP_S
10562                  || ASCII_FOLD_RESTRICTED
10563                  || ! AT_LEAST_UNI_SEMANTICS)
10564         {
10565             *character = (U8) code_point;
10566             len = 1;
10567         }
10568         else {
10569             *character = 's';
10570             *(character + 1) = 's';
10571             len = 2;
10572         }
10573     }
10574
10575     if (SIZE_ONLY) {
10576         RExC_size += STR_SZ(len);
10577     }
10578     else {
10579         RExC_emit += STR_SZ(len);
10580         STR_LEN(node) = len;
10581         if (! len_passed_in) {
10582             Copy((char *) character, STRING(node), len, char);
10583         }
10584     }
10585
10586     *flagp |= HASWIDTH;
10587
10588     /* A single character node is SIMPLE, except for the special-cased SHARP S
10589      * under /di. */
10590     if ((len == 1 || (UTF && len == UNISKIP(code_point)))
10591         && (code_point != LATIN_SMALL_LETTER_SHARP_S
10592             || ! FOLD || ! DEPENDS_SEMANTICS))
10593     {
10594         *flagp |= SIMPLE;
10595     }
10596 }
10597
10598
10599 /* return atoi(p), unless it's too big to sensibly be a backref,
10600  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
10601
10602 static I32
10603 S_backref_value(char *p)
10604 {
10605     char *q = p;
10606
10607     for (;isDIGIT(*q); q++); /* calculate length of num */
10608     if (q - p == 0 || q - p > 9)
10609         return I32_MAX;
10610     return atoi(p);
10611 }
10612
10613
10614 /*
10615  - regatom - the lowest level
10616
10617    Try to identify anything special at the start of the pattern. If there
10618    is, then handle it as required. This may involve generating a single regop,
10619    such as for an assertion; or it may involve recursing, such as to
10620    handle a () structure.
10621
10622    If the string doesn't start with something special then we gobble up
10623    as much literal text as we can.
10624
10625    Once we have been able to handle whatever type of thing started the
10626    sequence, we return.
10627
10628    Note: we have to be careful with escapes, as they can be both literal
10629    and special, and in the case of \10 and friends, context determines which.
10630
10631    A summary of the code structure is:
10632
10633    switch (first_byte) {
10634         cases for each special:
10635             handle this special;
10636             break;
10637         case '\\':
10638             switch (2nd byte) {
10639                 cases for each unambiguous special:
10640                     handle this special;
10641                     break;
10642                 cases for each ambigous special/literal:
10643                     disambiguate;
10644                     if (special)  handle here
10645                     else goto defchar;
10646                 default: // unambiguously literal:
10647                     goto defchar;
10648             }
10649         default:  // is a literal char
10650             // FALL THROUGH
10651         defchar:
10652             create EXACTish node for literal;
10653             while (more input and node isn't full) {
10654                 switch (input_byte) {
10655                    cases for each special;
10656                        make sure parse pointer is set so that the next call to
10657                            regatom will see this special first
10658                        goto loopdone; // EXACTish node terminated by prev. char
10659                    default:
10660                        append char to EXACTISH node;
10661                 }
10662                 get next input byte;
10663             }
10664         loopdone:
10665    }
10666    return the generated node;
10667
10668    Specifically there are two separate switches for handling
10669    escape sequences, with the one for handling literal escapes requiring
10670    a dummy entry for all of the special escapes that are actually handled
10671    by the other.
10672
10673    Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
10674    TRYAGAIN.
10675    Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10676    restarted.
10677    Otherwise does not return NULL.
10678 */
10679
10680 STATIC regnode *
10681 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10682 {
10683     dVAR;
10684     regnode *ret = NULL;
10685     I32 flags = 0;
10686     char *parse_start = RExC_parse;
10687     U8 op;
10688     int invert = 0;
10689
10690     GET_RE_DEBUG_FLAGS_DECL;
10691
10692     *flagp = WORST;             /* Tentatively. */
10693
10694     DEBUG_PARSE("atom");
10695
10696     PERL_ARGS_ASSERT_REGATOM;
10697
10698 tryagain:
10699     switch ((U8)*RExC_parse) {
10700     case '^':
10701         RExC_seen_zerolen++;
10702         nextchar(pRExC_state);
10703         if (RExC_flags & RXf_PMf_MULTILINE)
10704             ret = reg_node(pRExC_state, MBOL);
10705         else if (RExC_flags & RXf_PMf_SINGLELINE)
10706             ret = reg_node(pRExC_state, SBOL);
10707         else
10708             ret = reg_node(pRExC_state, BOL);
10709         Set_Node_Length(ret, 1); /* MJD */
10710         break;
10711     case '$':
10712         nextchar(pRExC_state);
10713         if (*RExC_parse)
10714             RExC_seen_zerolen++;
10715         if (RExC_flags & RXf_PMf_MULTILINE)
10716             ret = reg_node(pRExC_state, MEOL);
10717         else if (RExC_flags & RXf_PMf_SINGLELINE)
10718             ret = reg_node(pRExC_state, SEOL);
10719         else
10720             ret = reg_node(pRExC_state, EOL);
10721         Set_Node_Length(ret, 1); /* MJD */
10722         break;
10723     case '.':
10724         nextchar(pRExC_state);
10725         if (RExC_flags & RXf_PMf_SINGLELINE)
10726             ret = reg_node(pRExC_state, SANY);
10727         else
10728             ret = reg_node(pRExC_state, REG_ANY);
10729         *flagp |= HASWIDTH|SIMPLE;
10730         RExC_naughty++;
10731         Set_Node_Length(ret, 1); /* MJD */
10732         break;
10733     case '[':
10734     {
10735         char * const oregcomp_parse = ++RExC_parse;
10736         ret = regclass(pRExC_state, flagp,depth+1,
10737                        FALSE, /* means parse the whole char class */
10738                        TRUE, /* allow multi-char folds */
10739                        FALSE, /* don't silence non-portable warnings. */
10740                        NULL);
10741         if (*RExC_parse != ']') {
10742             RExC_parse = oregcomp_parse;
10743             vFAIL("Unmatched [");
10744         }
10745         if (ret == NULL) {
10746             if (*flagp & RESTART_UTF8)
10747                 return NULL;
10748             FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
10749                   (UV) *flagp);
10750         }
10751         nextchar(pRExC_state);
10752         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10753         break;
10754     }
10755     case '(':
10756         nextchar(pRExC_state);
10757         ret = reg(pRExC_state, 2, &flags,depth+1);
10758         if (ret == NULL) {
10759                 if (flags & TRYAGAIN) {
10760                     if (RExC_parse == RExC_end) {
10761                          /* Make parent create an empty node if needed. */
10762                         *flagp |= TRYAGAIN;
10763                         return(NULL);
10764                     }
10765                     goto tryagain;
10766                 }
10767                 if (flags & RESTART_UTF8) {
10768                     *flagp = RESTART_UTF8;
10769                     return NULL;
10770                 }
10771                 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", (UV) flags);
10772         }
10773         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10774         break;
10775     case '|':
10776     case ')':
10777         if (flags & TRYAGAIN) {
10778             *flagp |= TRYAGAIN;
10779             return NULL;
10780         }
10781         vFAIL("Internal urp");
10782                                 /* Supposed to be caught earlier. */
10783         break;
10784     case '{':
10785         if (!regcurly(RExC_parse, FALSE)) {
10786             RExC_parse++;
10787             goto defchar;
10788         }
10789         /* FALL THROUGH */
10790     case '?':
10791     case '+':
10792     case '*':
10793         RExC_parse++;
10794         vFAIL("Quantifier follows nothing");
10795         break;
10796     case '\\':
10797         /* Special Escapes
10798
10799            This switch handles escape sequences that resolve to some kind
10800            of special regop and not to literal text. Escape sequnces that
10801            resolve to literal text are handled below in the switch marked
10802            "Literal Escapes".
10803
10804            Every entry in this switch *must* have a corresponding entry
10805            in the literal escape switch. However, the opposite is not
10806            required, as the default for this switch is to jump to the
10807            literal text handling code.
10808         */
10809         switch ((U8)*++RExC_parse) {
10810             U8 arg;
10811         /* Special Escapes */
10812         case 'A':
10813             RExC_seen_zerolen++;
10814             ret = reg_node(pRExC_state, SBOL);
10815             *flagp |= SIMPLE;
10816             goto finish_meta_pat;
10817         case 'G':
10818             ret = reg_node(pRExC_state, GPOS);
10819             RExC_seen |= REG_SEEN_GPOS;
10820             *flagp |= SIMPLE;
10821             goto finish_meta_pat;
10822         case 'K':
10823             RExC_seen_zerolen++;
10824             ret = reg_node(pRExC_state, KEEPS);
10825             *flagp |= SIMPLE;
10826             /* XXX:dmq : disabling in-place substitution seems to
10827              * be necessary here to avoid cases of memory corruption, as
10828              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10829              */
10830             RExC_seen |= REG_SEEN_LOOKBEHIND;
10831             goto finish_meta_pat;
10832         case 'Z':
10833             ret = reg_node(pRExC_state, SEOL);
10834             *flagp |= SIMPLE;
10835             RExC_seen_zerolen++;                /* Do not optimize RE away */
10836             goto finish_meta_pat;
10837         case 'z':
10838             ret = reg_node(pRExC_state, EOS);
10839             *flagp |= SIMPLE;
10840             RExC_seen_zerolen++;                /* Do not optimize RE away */
10841             goto finish_meta_pat;
10842         case 'C':
10843             ret = reg_node(pRExC_state, CANY);
10844             RExC_seen |= REG_SEEN_CANY;
10845             *flagp |= HASWIDTH|SIMPLE;
10846             goto finish_meta_pat;
10847         case 'X':
10848             ret = reg_node(pRExC_state, CLUMP);
10849             *flagp |= HASWIDTH;
10850             goto finish_meta_pat;
10851
10852         case 'W':
10853             invert = 1;
10854             /* FALLTHROUGH */
10855         case 'w':
10856             arg = ANYOF_WORDCHAR;
10857             goto join_posix;
10858
10859         case 'b':
10860             RExC_seen_zerolen++;
10861             RExC_seen |= REG_SEEN_LOOKBEHIND;
10862             op = BOUND + get_regex_charset(RExC_flags);
10863             if (op > BOUNDA) {  /* /aa is same as /a */
10864                 op = BOUNDA;
10865             }
10866             ret = reg_node(pRExC_state, op);
10867             FLAGS(ret) = get_regex_charset(RExC_flags);
10868             *flagp |= SIMPLE;
10869             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10870                 ckWARNdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" or \"\\b[{]\" instead");
10871             }
10872             goto finish_meta_pat;
10873         case 'B':
10874             RExC_seen_zerolen++;
10875             RExC_seen |= REG_SEEN_LOOKBEHIND;
10876             op = NBOUND + get_regex_charset(RExC_flags);
10877             if (op > NBOUNDA) { /* /aa is same as /a */
10878                 op = NBOUNDA;
10879             }
10880             ret = reg_node(pRExC_state, op);
10881             FLAGS(ret) = get_regex_charset(RExC_flags);
10882             *flagp |= SIMPLE;
10883             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10884                 ckWARNdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" or \"\\B[{]\" instead");
10885             }
10886             goto finish_meta_pat;
10887
10888         case 'D':
10889             invert = 1;
10890             /* FALLTHROUGH */
10891         case 'd':
10892             arg = ANYOF_DIGIT;
10893             goto join_posix;
10894
10895         case 'R':
10896             ret = reg_node(pRExC_state, LNBREAK);
10897             *flagp |= HASWIDTH|SIMPLE;
10898             goto finish_meta_pat;
10899
10900         case 'H':
10901             invert = 1;
10902             /* FALLTHROUGH */
10903         case 'h':
10904             arg = ANYOF_BLANK;
10905             op = POSIXU;
10906             goto join_posix_op_known;
10907
10908         case 'V':
10909             invert = 1;
10910             /* FALLTHROUGH */
10911         case 'v':
10912             arg = ANYOF_VERTWS;
10913             op = POSIXU;
10914             goto join_posix_op_known;
10915
10916         case 'S':
10917             invert = 1;
10918             /* FALLTHROUGH */
10919         case 's':
10920             arg = ANYOF_SPACE;
10921
10922         join_posix:
10923
10924             op = POSIXD + get_regex_charset(RExC_flags);
10925             if (op > POSIXA) {  /* /aa is same as /a */
10926                 op = POSIXA;
10927             }
10928
10929         join_posix_op_known:
10930
10931             if (invert) {
10932                 op += NPOSIXD - POSIXD;
10933             }
10934
10935             ret = reg_node(pRExC_state, op);
10936             if (! SIZE_ONLY) {
10937                 FLAGS(ret) = namedclass_to_classnum(arg);
10938             }
10939
10940             *flagp |= HASWIDTH|SIMPLE;
10941             /* FALL THROUGH */
10942
10943          finish_meta_pat:
10944             nextchar(pRExC_state);
10945             Set_Node_Length(ret, 2); /* MJD */
10946             break;
10947         case 'p':
10948         case 'P':
10949             {
10950 #ifdef DEBUGGING
10951                 char* parse_start = RExC_parse - 2;
10952 #endif
10953
10954                 RExC_parse--;
10955
10956                 ret = regclass(pRExC_state, flagp,depth+1,
10957                                TRUE, /* means just parse this element */
10958                                FALSE, /* don't allow multi-char folds */
10959                                FALSE, /* don't silence non-portable warnings.
10960                                          It would be a bug if these returned
10961                                          non-portables */
10962                                NULL);
10963                 /* regclass() can only return RESTART_UTF8 if multi-char folds
10964                    are allowed.  */
10965                 if (!ret)
10966                     FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
10967                           (UV) *flagp);
10968
10969                 RExC_parse--;
10970
10971                 Set_Node_Offset(ret, parse_start + 2);
10972                 Set_Node_Cur_Length(ret, parse_start);
10973                 nextchar(pRExC_state);
10974             }
10975             break;
10976         case 'N':
10977             /* Handle \N and \N{NAME} with multiple code points here and not
10978              * below because it can be multicharacter. join_exact() will join
10979              * them up later on.  Also this makes sure that things like
10980              * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
10981              * The options to the grok function call causes it to fail if the
10982              * sequence is just a single code point.  We then go treat it as
10983              * just another character in the current EXACT node, and hence it
10984              * gets uniform treatment with all the other characters.  The
10985              * special treatment for quantifiers is not needed for such single
10986              * character sequences */
10987             ++RExC_parse;
10988             if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
10989                                 FALSE /* not strict */ )) {
10990                 if (*flagp & RESTART_UTF8)
10991                     return NULL;
10992                 RExC_parse--;
10993                 goto defchar;
10994             }
10995             break;
10996         case 'k':    /* Handle \k<NAME> and \k'NAME' */
10997         parse_named_seq:
10998         {
10999             char ch= RExC_parse[1];
11000             if (ch != '<' && ch != '\'' && ch != '{') {
11001                 RExC_parse++;
11002                 vFAIL2("Sequence %.2s... not terminated",parse_start);
11003             } else {
11004                 /* this pretty much dupes the code for (?P=...) in reg(), if
11005                    you change this make sure you change that */
11006                 char* name_start = (RExC_parse += 2);
11007                 U32 num = 0;
11008                 SV *sv_dat = reg_scan_name(pRExC_state,
11009                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11010                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
11011                 if (RExC_parse == name_start || *RExC_parse != ch)
11012                     vFAIL2("Sequence %.3s... not terminated",parse_start);
11013
11014                 if (!SIZE_ONLY) {
11015                     num = add_data( pRExC_state, STR_WITH_LEN("S"));
11016                     RExC_rxi->data->data[num]=(void*)sv_dat;
11017                     SvREFCNT_inc_simple_void(sv_dat);
11018                 }
11019
11020                 RExC_sawback = 1;
11021                 ret = reganode(pRExC_state,
11022                                ((! FOLD)
11023                                  ? NREF
11024                                  : (ASCII_FOLD_RESTRICTED)
11025                                    ? NREFFA
11026                                    : (AT_LEAST_UNI_SEMANTICS)
11027                                      ? NREFFU
11028                                      : (LOC)
11029                                        ? NREFFL
11030                                        : NREFF),
11031                                 num);
11032                 *flagp |= HASWIDTH;
11033
11034                 /* override incorrect value set in reganode MJD */
11035                 Set_Node_Offset(ret, parse_start+1);
11036                 Set_Node_Cur_Length(ret, parse_start);
11037                 nextchar(pRExC_state);
11038
11039             }
11040             break;
11041         }
11042         case 'g':
11043         case '1': case '2': case '3': case '4':
11044         case '5': case '6': case '7': case '8': case '9':
11045             {
11046                 I32 num;
11047                 bool hasbrace = 0;
11048
11049                 if (*RExC_parse == 'g') {
11050                     bool isrel = 0;
11051
11052                     RExC_parse++;
11053                     if (*RExC_parse == '{') {
11054                         RExC_parse++;
11055                         hasbrace = 1;
11056                     }
11057                     if (*RExC_parse == '-') {
11058                         RExC_parse++;
11059                         isrel = 1;
11060                     }
11061                     if (hasbrace && !isDIGIT(*RExC_parse)) {
11062                         if (isrel) RExC_parse--;
11063                         RExC_parse -= 2;
11064                         goto parse_named_seq;
11065                     }
11066
11067                     num = S_backref_value(RExC_parse);
11068                     if (num == 0)
11069                         vFAIL("Reference to invalid group 0");
11070                     else if (num == I32_MAX) {
11071                          if (isDIGIT(*RExC_parse))
11072                             vFAIL("Reference to nonexistent group");
11073                         else
11074                             vFAIL("Unterminated \\g... pattern");
11075                     }
11076
11077                     if (isrel) {
11078                         num = RExC_npar - num;
11079                         if (num < 1)
11080                             vFAIL("Reference to nonexistent or unclosed group");
11081                     }
11082                 }
11083                 else {
11084                     num = S_backref_value(RExC_parse);
11085                     /* bare \NNN might be backref or octal */
11086                     if (num == I32_MAX || (num > 9 && num >= RExC_npar
11087                             && *RExC_parse != '8' && *RExC_parse != '9'))
11088                         /* Probably a character specified in octal, e.g. \35 */
11089                         goto defchar;
11090                 }
11091
11092                 /* at this point RExC_parse definitely points to a backref
11093                  * number */
11094                 {
11095 #ifdef RE_TRACK_PATTERN_OFFSETS
11096                     char * const parse_start = RExC_parse - 1; /* MJD */
11097 #endif
11098                     while (isDIGIT(*RExC_parse))
11099                         RExC_parse++;
11100                     if (hasbrace) {
11101                         if (*RExC_parse != '}')
11102                             vFAIL("Unterminated \\g{...} pattern");
11103                         RExC_parse++;
11104                     }
11105                     if (!SIZE_ONLY) {
11106                         if (num > (I32)RExC_rx->nparens)
11107                             vFAIL("Reference to nonexistent group");
11108                     }
11109                     RExC_sawback = 1;
11110                     ret = reganode(pRExC_state,
11111                                    ((! FOLD)
11112                                      ? REF
11113                                      : (ASCII_FOLD_RESTRICTED)
11114                                        ? REFFA
11115                                        : (AT_LEAST_UNI_SEMANTICS)
11116                                          ? REFFU
11117                                          : (LOC)
11118                                            ? REFFL
11119                                            : REFF),
11120                                     num);
11121                     *flagp |= HASWIDTH;
11122
11123                     /* override incorrect value set in reganode MJD */
11124                     Set_Node_Offset(ret, parse_start+1);
11125                     Set_Node_Cur_Length(ret, parse_start);
11126                     RExC_parse--;
11127                     nextchar(pRExC_state);
11128                 }
11129             }
11130             break;
11131         case '\0':
11132             if (RExC_parse >= RExC_end)
11133                 FAIL("Trailing \\");
11134             /* FALL THROUGH */
11135         default:
11136             /* Do not generate "unrecognized" warnings here, we fall
11137                back into the quick-grab loop below */
11138             parse_start--;
11139             goto defchar;
11140         }
11141         break;
11142
11143     case '#':
11144         if (RExC_flags & RXf_PMf_EXTENDED) {
11145             if ( reg_skipcomment( pRExC_state ) )
11146                 goto tryagain;
11147         }
11148         /* FALL THROUGH */
11149
11150     default:
11151
11152             parse_start = RExC_parse - 1;
11153
11154             RExC_parse++;
11155
11156         defchar: {
11157             STRLEN len = 0;
11158             UV ender = 0;
11159             char *p;
11160             char *s;
11161 #define MAX_NODE_STRING_SIZE 127
11162             char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
11163             char *s0;
11164             U8 upper_parse = MAX_NODE_STRING_SIZE;
11165             STRLEN foldlen;
11166             U8 node_type = compute_EXACTish(pRExC_state);
11167             bool next_is_quantifier;
11168             char * oldp = NULL;
11169
11170             /* We can convert EXACTF nodes to EXACTFU if they contain only
11171              * characters that match identically regardless of the target
11172              * string's UTF8ness.  The reason to do this is that EXACTF is not
11173              * trie-able, EXACTFU is.  (We don't need to figure this out until
11174              * pass 2) */
11175             bool maybe_exactfu = node_type == EXACTF && PASS2;
11176
11177             /* If a folding node contains only code points that don't
11178              * participate in folds, it can be changed into an EXACT node,
11179              * which allows the optimizer more things to look for */
11180             bool maybe_exact;
11181
11182             ret = reg_node(pRExC_state, node_type);
11183
11184             /* In pass1, folded, we use a temporary buffer instead of the
11185              * actual node, as the node doesn't exist yet */
11186             s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
11187
11188             s0 = s;
11189
11190         reparse:
11191
11192             /* We do the EXACTFish to EXACT node only if folding, and not if in
11193              * locale, as whether a character folds or not isn't known until
11194              * runtime.  (And we don't need to figure this out until pass 2) */
11195             maybe_exact = FOLD && ! LOC && PASS2;
11196
11197             /* XXX The node can hold up to 255 bytes, yet this only goes to
11198              * 127.  I (khw) do not know why.  Keeping it somewhat less than
11199              * 255 allows us to not have to worry about overflow due to
11200              * converting to utf8 and fold expansion, but that value is
11201              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
11202              * split up by this limit into a single one using the real max of
11203              * 255.  Even at 127, this breaks under rare circumstances.  If
11204              * folding, we do not want to split a node at a character that is a
11205              * non-final in a multi-char fold, as an input string could just
11206              * happen to want to match across the node boundary.  The join
11207              * would solve that problem if the join actually happens.  But a
11208              * series of more than two nodes in a row each of 127 would cause
11209              * the first join to succeed to get to 254, but then there wouldn't
11210              * be room for the next one, which could at be one of those split
11211              * multi-char folds.  I don't know of any fool-proof solution.  One
11212              * could back off to end with only a code point that isn't such a
11213              * non-final, but it is possible for there not to be any in the
11214              * entire node. */
11215             for (p = RExC_parse - 1;
11216                  len < upper_parse && p < RExC_end;
11217                  len++)
11218             {
11219                 oldp = p;
11220
11221                 if (RExC_flags & RXf_PMf_EXTENDED)
11222                     p = regwhite( pRExC_state, p );
11223                 switch ((U8)*p) {
11224                 case '^':
11225                 case '$':
11226                 case '.':
11227                 case '[':
11228                 case '(':
11229                 case ')':
11230                 case '|':
11231                     goto loopdone;
11232                 case '\\':
11233                     /* Literal Escapes Switch
11234
11235                        This switch is meant to handle escape sequences that
11236                        resolve to a literal character.
11237
11238                        Every escape sequence that represents something
11239                        else, like an assertion or a char class, is handled
11240                        in the switch marked 'Special Escapes' above in this
11241                        routine, but also has an entry here as anything that
11242                        isn't explicitly mentioned here will be treated as
11243                        an unescaped equivalent literal.
11244                     */
11245
11246                     switch ((U8)*++p) {
11247                     /* These are all the special escapes. */
11248                     case 'A':             /* Start assertion */
11249                     case 'b': case 'B':   /* Word-boundary assertion*/
11250                     case 'C':             /* Single char !DANGEROUS! */
11251                     case 'd': case 'D':   /* digit class */
11252                     case 'g': case 'G':   /* generic-backref, pos assertion */
11253                     case 'h': case 'H':   /* HORIZWS */
11254                     case 'k': case 'K':   /* named backref, keep marker */
11255                     case 'p': case 'P':   /* Unicode property */
11256                               case 'R':   /* LNBREAK */
11257                     case 's': case 'S':   /* space class */
11258                     case 'v': case 'V':   /* VERTWS */
11259                     case 'w': case 'W':   /* word class */
11260                     case 'X':             /* eXtended Unicode "combining character sequence" */
11261                     case 'z': case 'Z':   /* End of line/string assertion */
11262                         --p;
11263                         goto loopdone;
11264
11265                     /* Anything after here is an escape that resolves to a
11266                        literal. (Except digits, which may or may not)
11267                      */
11268                     case 'n':
11269                         ender = '\n';
11270                         p++;
11271                         break;
11272                     case 'N': /* Handle a single-code point named character. */
11273                         /* The options cause it to fail if a multiple code
11274                          * point sequence.  Handle those in the switch() above
11275                          * */
11276                         RExC_parse = p + 1;
11277                         if (! grok_bslash_N(pRExC_state, NULL, &ender,
11278                                             flagp, depth, FALSE,
11279                                             FALSE /* not strict */ ))
11280                         {
11281                             if (*flagp & RESTART_UTF8)
11282                                 FAIL("panic: grok_bslash_N set RESTART_UTF8");
11283                             RExC_parse = p = oldp;
11284                             goto loopdone;
11285                         }
11286                         p = RExC_parse;
11287                         if (ender > 0xff) {
11288                             REQUIRE_UTF8;
11289                         }
11290                         break;
11291                     case 'r':
11292                         ender = '\r';
11293                         p++;
11294                         break;
11295                     case 't':
11296                         ender = '\t';
11297                         p++;
11298                         break;
11299                     case 'f':
11300                         ender = '\f';
11301                         p++;
11302                         break;
11303                     case 'e':
11304                           ender = ASCII_TO_NATIVE('\033');
11305                         p++;
11306                         break;
11307                     case 'a':
11308                           ender = '\a';
11309                         p++;
11310                         break;
11311                     case 'o':
11312                         {
11313                             UV result;
11314                             const char* error_msg;
11315
11316                             bool valid = grok_bslash_o(&p,
11317                                                        &result,
11318                                                        &error_msg,
11319                                                        TRUE, /* out warnings */
11320                                                        FALSE, /* not strict */
11321                                                        TRUE, /* Output warnings
11322                                                                 for non-
11323                                                                 portables */
11324                                                        UTF);
11325                             if (! valid) {
11326                                 RExC_parse = p; /* going to die anyway; point
11327                                                    to exact spot of failure */
11328                                 vFAIL(error_msg);
11329                             }
11330                             ender = result;
11331                             if (PL_encoding && ender < 0x100) {
11332                                 goto recode_encoding;
11333                             }
11334                             if (ender > 0xff) {
11335                                 REQUIRE_UTF8;
11336                             }
11337                             break;
11338                         }
11339                     case 'x':
11340                         {
11341                             UV result = UV_MAX; /* initialize to erroneous
11342                                                    value */
11343                             const char* error_msg;
11344
11345                             bool valid = grok_bslash_x(&p,
11346                                                        &result,
11347                                                        &error_msg,
11348                                                        TRUE, /* out warnings */
11349                                                        FALSE, /* not strict */
11350                                                        TRUE, /* Output warnings
11351                                                                 for non-
11352                                                                 portables */
11353                                                        UTF);
11354                             if (! valid) {
11355                                 RExC_parse = p; /* going to die anyway; point
11356                                                    to exact spot of failure */
11357                                 vFAIL(error_msg);
11358                             }
11359                             ender = result;
11360
11361                             if (PL_encoding && ender < 0x100) {
11362                                 goto recode_encoding;
11363                             }
11364                             if (ender > 0xff) {
11365                                 REQUIRE_UTF8;
11366                             }
11367                             break;
11368                         }
11369                     case 'c':
11370                         p++;
11371                         ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
11372                         break;
11373                     case '8': case '9': /* must be a backreference */
11374                         --p;
11375                         goto loopdone;
11376                     case '1': case '2': case '3':case '4':
11377                     case '5': case '6': case '7':
11378                         /* When we parse backslash escapes there is ambiguity
11379                          * between backreferences and octal escapes. Any escape
11380                          * from \1 - \9 is a backreference, any multi-digit
11381                          * escape which does not start with 0 and which when
11382                          * evaluated as decimal could refer to an already
11383                          * parsed capture buffer is a backslash. Anything else
11384                          * is octal.
11385                          *
11386                          * Note this implies that \118 could be interpreted as
11387                          * 118 OR as "\11" . "8" depending on whether there
11388                          * were 118 capture buffers defined already in the
11389                          * pattern.  */
11390                         if ( !isDIGIT(p[1]) || S_backref_value(p) <= RExC_npar)
11391                         {  /* Not to be treated as an octal constant, go
11392                                    find backref */
11393                             --p;
11394                             goto loopdone;
11395                         }
11396                     case '0':
11397                         {
11398                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11399                             STRLEN numlen = 3;
11400                             ender = grok_oct(p, &numlen, &flags, NULL);
11401                             if (ender > 0xff) {
11402                                 REQUIRE_UTF8;
11403                             }
11404                             p += numlen;
11405                             if (SIZE_ONLY   /* like \08, \178 */
11406                                 && numlen < 3
11407                                 && p < RExC_end
11408                                 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
11409                             {
11410                                 reg_warn_non_literal_string(
11411                                          p + 1,
11412                                          form_short_octal_warning(p, numlen));
11413                             }
11414                         }
11415                         if (PL_encoding && ender < 0x100)
11416                             goto recode_encoding;
11417                         break;
11418                     recode_encoding:
11419                         if (! RExC_override_recoding) {
11420                             SV* enc = PL_encoding;
11421                             ender = reg_recode((const char)(U8)ender, &enc);
11422                             if (!enc && SIZE_ONLY)
11423                                 ckWARNreg(p, "Invalid escape in the specified encoding");
11424                             REQUIRE_UTF8;
11425                         }
11426                         break;
11427                     case '\0':
11428                         if (p >= RExC_end)
11429                             FAIL("Trailing \\");
11430                         /* FALL THROUGH */
11431                     default:
11432                         if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
11433                             /* Include any { following the alpha to emphasize
11434                              * that it could be part of an escape at some point
11435                              * in the future */
11436                             int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
11437                             ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
11438                         }
11439                         goto normal_default;
11440                     } /* End of switch on '\' */
11441                     break;
11442                 default:    /* A literal character */
11443
11444                     if (! SIZE_ONLY
11445                         && RExC_flags & RXf_PMf_EXTENDED
11446                         && ckWARN_d(WARN_DEPRECATED)
11447                         && is_PATWS_non_low(p, UTF))
11448                     {
11449                         vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
11450                                 "Escape literal pattern white space under /x");
11451                     }
11452
11453                   normal_default:
11454                     if (UTF8_IS_START(*p) && UTF) {
11455                         STRLEN numlen;
11456                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
11457                                                &numlen, UTF8_ALLOW_DEFAULT);
11458                         p += numlen;
11459                     }
11460                     else
11461                         ender = (U8) *p++;
11462                     break;
11463                 } /* End of switch on the literal */
11464
11465                 /* Here, have looked at the literal character and <ender>
11466                  * contains its ordinal, <p> points to the character after it
11467                  */
11468
11469                 if ( RExC_flags & RXf_PMf_EXTENDED)
11470                     p = regwhite( pRExC_state, p );
11471
11472                 /* If the next thing is a quantifier, it applies to this
11473                  * character only, which means that this character has to be in
11474                  * its own node and can't just be appended to the string in an
11475                  * existing node, so if there are already other characters in
11476                  * the node, close the node with just them, and set up to do
11477                  * this character again next time through, when it will be the
11478                  * only thing in its new node */
11479                 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
11480                 {
11481                     p = oldp;
11482                     goto loopdone;
11483                 }
11484
11485                 if (! FOLD) {
11486                     if (UTF) {
11487                         const STRLEN unilen = reguni(pRExC_state, ender, s);
11488                         if (unilen > 0) {
11489                            s   += unilen;
11490                            len += unilen;
11491                         }
11492
11493                         /* The loop increments <len> each time, as all but this
11494                          * path (and one other) through it add a single byte to
11495                          * the EXACTish node.  But this one has changed len to
11496                          * be the correct final value, so subtract one to
11497                          * cancel out the increment that follows */
11498                         len--;
11499                     }
11500                     else {
11501                         REGC((char)ender, s++);
11502                     }
11503                 }
11504                 else /* FOLD */ if (! ( UTF
11505                         /* See comments for join_exact() as to why we fold this
11506                          * non-UTF at compile time */
11507                         || (node_type == EXACTFU
11508                             && ender == LATIN_SMALL_LETTER_SHARP_S)))
11509                 {
11510                     if (IS_IN_SOME_FOLD_L1(ender)) {
11511                         maybe_exact = FALSE;
11512
11513                         /* See if the character's fold differs between /d and
11514                          * /u.  This includes the multi-char fold SHARP S to
11515                          * 'ss' */
11516                         if (maybe_exactfu
11517                             && (PL_fold[ender] != PL_fold_latin1[ender]
11518                                 || ender == LATIN_SMALL_LETTER_SHARP_S
11519                                 || (len > 0
11520                                    && isARG2_lower_or_UPPER_ARG1('s', ender)
11521                                    && isARG2_lower_or_UPPER_ARG1('s', *(s-1)))))
11522                         {
11523                             maybe_exactfu = FALSE;
11524                         }
11525                     }
11526                     *(s++) = (char) ender;
11527                 }
11528                 else {  /* UTF */
11529
11530                     /* Prime the casefolded buffer.  Locale rules, which apply
11531                      * only to code points < 256, aren't known until execution,
11532                      * so for them, just output the original character using
11533                      * utf8.  If we start to fold non-UTF patterns, be sure to
11534                      * update join_exact() */
11535                     if (LOC && ender < 256) {
11536                         if (UVCHR_IS_INVARIANT(ender)) {
11537                             *s = (U8) ender;
11538                             foldlen = 1;
11539                         } else {
11540                             *s = UTF8_TWO_BYTE_HI(ender);
11541                             *(s + 1) = UTF8_TWO_BYTE_LO(ender);
11542                             foldlen = 2;
11543                         }
11544                     }
11545                     else {
11546                         UV folded = _to_uni_fold_flags(
11547                                        ender,
11548                                        (U8 *) s,
11549                                        &foldlen,
11550                                        FOLD_FLAGS_FULL
11551                                        | ((LOC) ?  FOLD_FLAGS_LOCALE
11552                                                 : (ASCII_FOLD_RESTRICTED)
11553                                                   ? FOLD_FLAGS_NOMIX_ASCII
11554                                                   : 0)
11555                                         );
11556
11557                         /* If this node only contains non-folding code points
11558                          * so far, see if this new one is also non-folding */
11559                         if (maybe_exact) {
11560                             if (folded != ender) {
11561                                 maybe_exact = FALSE;
11562                             }
11563                             else {
11564                                 /* Here the fold is the original; we have
11565                                  * to check further to see if anything
11566                                  * folds to it */
11567                                 if (! PL_utf8_foldable) {
11568                                     SV* swash = swash_init("utf8",
11569                                                        "_Perl_Any_Folds",
11570                                                        &PL_sv_undef, 1, 0);
11571                                     PL_utf8_foldable =
11572                                                 _get_swash_invlist(swash);
11573                                     SvREFCNT_dec_NN(swash);
11574                                 }
11575                                 if (_invlist_contains_cp(PL_utf8_foldable,
11576                                                          ender))
11577                                 {
11578                                     maybe_exact = FALSE;
11579                                 }
11580                             }
11581                         }
11582                         ender = folded;
11583                     }
11584                     s += foldlen;
11585
11586                     /* The loop increments <len> each time, as all but this
11587                      * path (and one other) through it add a single byte to the
11588                      * EXACTish node.  But this one has changed len to be the
11589                      * correct final value, so subtract one to cancel out the
11590                      * increment that follows */
11591                     len += foldlen - 1;
11592                 }
11593
11594                 if (next_is_quantifier) {
11595
11596                     /* Here, the next input is a quantifier, and to get here,
11597                      * the current character is the only one in the node.
11598                      * Also, here <len> doesn't include the final byte for this
11599                      * character */
11600                     len++;
11601                     goto loopdone;
11602                 }
11603
11604             } /* End of loop through literal characters */
11605
11606             /* Here we have either exhausted the input or ran out of room in
11607              * the node.  (If we encountered a character that can't be in the
11608              * node, transfer is made directly to <loopdone>, and so we
11609              * wouldn't have fallen off the end of the loop.)  In the latter
11610              * case, we artificially have to split the node into two, because
11611              * we just don't have enough space to hold everything.  This
11612              * creates a problem if the final character participates in a
11613              * multi-character fold in the non-final position, as a match that
11614              * should have occurred won't, due to the way nodes are matched,
11615              * and our artificial boundary.  So back off until we find a non-
11616              * problematic character -- one that isn't at the beginning or
11617              * middle of such a fold.  (Either it doesn't participate in any
11618              * folds, or appears only in the final position of all the folds it
11619              * does participate in.)  A better solution with far fewer false
11620              * positives, and that would fill the nodes more completely, would
11621              * be to actually have available all the multi-character folds to
11622              * test against, and to back-off only far enough to be sure that
11623              * this node isn't ending with a partial one.  <upper_parse> is set
11624              * further below (if we need to reparse the node) to include just
11625              * up through that final non-problematic character that this code
11626              * identifies, so when it is set to less than the full node, we can
11627              * skip the rest of this */
11628             if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
11629
11630                 const STRLEN full_len = len;
11631
11632                 assert(len >= MAX_NODE_STRING_SIZE);
11633
11634                 /* Here, <s> points to the final byte of the final character.
11635                  * Look backwards through the string until find a non-
11636                  * problematic character */
11637
11638                 if (! UTF) {
11639
11640                     /* These two have no multi-char folds to non-UTF characters
11641                      */
11642                     if (ASCII_FOLD_RESTRICTED || LOC) {
11643                         goto loopdone;
11644                     }
11645
11646                     while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
11647                     len = s - s0 + 1;
11648                 }
11649                 else {
11650                     if (!  PL_NonL1NonFinalFold) {
11651                         PL_NonL1NonFinalFold = _new_invlist_C_array(
11652                                         NonL1_Perl_Non_Final_Folds_invlist);
11653                     }
11654
11655                     /* Point to the first byte of the final character */
11656                     s = (char *) utf8_hop((U8 *) s, -1);
11657
11658                     while (s >= s0) {   /* Search backwards until find
11659                                            non-problematic char */
11660                         if (UTF8_IS_INVARIANT(*s)) {
11661
11662                             /* There are no ascii characters that participate
11663                              * in multi-char folds under /aa.  In EBCDIC, the
11664                              * non-ascii invariants are all control characters,
11665                              * so don't ever participate in any folds. */
11666                             if (ASCII_FOLD_RESTRICTED
11667                                 || ! IS_NON_FINAL_FOLD(*s))
11668                             {
11669                                 break;
11670                             }
11671                         }
11672                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
11673
11674                             /* No Latin1 characters participate in multi-char
11675                              * folds under /l */
11676                             if (LOC
11677                                 || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
11678                                                                   *s, *(s+1))))
11679                             {
11680                                 break;
11681                             }
11682                         }
11683                         else if (! _invlist_contains_cp(
11684                                         PL_NonL1NonFinalFold,
11685                                         valid_utf8_to_uvchr((U8 *) s, NULL)))
11686                         {
11687                             break;
11688                         }
11689
11690                         /* Here, the current character is problematic in that
11691                          * it does occur in the non-final position of some
11692                          * fold, so try the character before it, but have to
11693                          * special case the very first byte in the string, so
11694                          * we don't read outside the string */
11695                         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
11696                     } /* End of loop backwards through the string */
11697
11698                     /* If there were only problematic characters in the string,
11699                      * <s> will point to before s0, in which case the length
11700                      * should be 0, otherwise include the length of the
11701                      * non-problematic character just found */
11702                     len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
11703                 }
11704
11705                 /* Here, have found the final character, if any, that is
11706                  * non-problematic as far as ending the node without splitting
11707                  * it across a potential multi-char fold.  <len> contains the
11708                  * number of bytes in the node up-to and including that
11709                  * character, or is 0 if there is no such character, meaning
11710                  * the whole node contains only problematic characters.  In
11711                  * this case, give up and just take the node as-is.  We can't
11712                  * do any better */
11713                 if (len == 0) {
11714                     len = full_len;
11715
11716                     /* If the node ends in an 's' we make sure it stays EXACTF,
11717                      * as if it turns into an EXACTFU, it could later get
11718                      * joined with another 's' that would then wrongly match
11719                      * the sharp s */
11720                     if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
11721                     {
11722                         maybe_exactfu = FALSE;
11723                     }
11724                 } else {
11725
11726                     /* Here, the node does contain some characters that aren't
11727                      * problematic.  If one such is the final character in the
11728                      * node, we are done */
11729                     if (len == full_len) {
11730                         goto loopdone;
11731                     }
11732                     else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
11733
11734                         /* If the final character is problematic, but the
11735                          * penultimate is not, back-off that last character to
11736                          * later start a new node with it */
11737                         p = oldp;
11738                         goto loopdone;
11739                     }
11740
11741                     /* Here, the final non-problematic character is earlier
11742                      * in the input than the penultimate character.  What we do
11743                      * is reparse from the beginning, going up only as far as
11744                      * this final ok one, thus guaranteeing that the node ends
11745                      * in an acceptable character.  The reason we reparse is
11746                      * that we know how far in the character is, but we don't
11747                      * know how to correlate its position with the input parse.
11748                      * An alternate implementation would be to build that
11749                      * correlation as we go along during the original parse,
11750                      * but that would entail extra work for every node, whereas
11751                      * this code gets executed only when the string is too
11752                      * large for the node, and the final two characters are
11753                      * problematic, an infrequent occurrence.  Yet another
11754                      * possible strategy would be to save the tail of the
11755                      * string, and the next time regatom is called, initialize
11756                      * with that.  The problem with this is that unless you
11757                      * back off one more character, you won't be guaranteed
11758                      * regatom will get called again, unless regbranch,
11759                      * regpiece ... are also changed.  If you do back off that
11760                      * extra character, so that there is input guaranteed to
11761                      * force calling regatom, you can't handle the case where
11762                      * just the first character in the node is acceptable.  I
11763                      * (khw) decided to try this method which doesn't have that
11764                      * pitfall; if performance issues are found, we can do a
11765                      * combination of the current approach plus that one */
11766                     upper_parse = len;
11767                     len = 0;
11768                     s = s0;
11769                     goto reparse;
11770                 }
11771             }   /* End of verifying node ends with an appropriate char */
11772
11773         loopdone:   /* Jumped to when encounters something that shouldn't be in
11774                        the node */
11775
11776             /* I (khw) don't know if you can get here with zero length, but the
11777              * old code handled this situation by creating a zero-length EXACT
11778              * node.  Might as well be NOTHING instead */
11779             if (len == 0) {
11780                 OP(ret) = NOTHING;
11781             }
11782             else {
11783                 if (FOLD) {
11784                     /* If 'maybe_exact' is still set here, means there are no
11785                      * code points in the node that participate in folds;
11786                      * similarly for 'maybe_exactfu' and code points that match
11787                      * differently depending on UTF8ness of the target string
11788                      * */
11789                     if (maybe_exact) {
11790                         OP(ret) = EXACT;
11791                     }
11792                     else if (maybe_exactfu) {
11793                         OP(ret) = EXACTFU;
11794                     }
11795                 }
11796                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
11797             }
11798
11799             RExC_parse = p - 1;
11800             Set_Node_Cur_Length(ret, parse_start);
11801             nextchar(pRExC_state);
11802             {
11803                 /* len is STRLEN which is unsigned, need to copy to signed */
11804                 IV iv = len;
11805                 if (iv < 0)
11806                     vFAIL("Internal disaster");
11807             }
11808
11809         } /* End of label 'defchar:' */
11810         break;
11811     } /* End of giant switch on input character */
11812
11813     return(ret);
11814 }
11815
11816 STATIC char *
11817 S_regwhite( RExC_state_t *pRExC_state, char *p )
11818 {
11819     const char *e = RExC_end;
11820
11821     PERL_ARGS_ASSERT_REGWHITE;
11822
11823     while (p < e) {
11824         if (isSPACE(*p))
11825             ++p;
11826         else if (*p == '#') {
11827             bool ended = 0;
11828             do {
11829                 if (*p++ == '\n') {
11830                     ended = 1;
11831                     break;
11832                 }
11833             } while (p < e);
11834             if (!ended)
11835                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11836         }
11837         else
11838             break;
11839     }
11840     return p;
11841 }
11842
11843 STATIC char *
11844 S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
11845 {
11846     /* Returns the next non-pattern-white space, non-comment character (the
11847      * latter only if 'recognize_comment is true) in the string p, which is
11848      * ended by RExC_end.  If there is no line break ending a comment,
11849      * RExC_seen has added the REG_SEEN_RUN_ON_COMMENT flag; */
11850     const char *e = RExC_end;
11851
11852     PERL_ARGS_ASSERT_REGPATWS;
11853
11854     while (p < e) {
11855         STRLEN len;
11856         if ((len = is_PATWS_safe(p, e, UTF))) {
11857             p += len;
11858         }
11859         else if (recognize_comment && *p == '#') {
11860             bool ended = 0;
11861             do {
11862                 p++;
11863                 if (is_LNBREAK_safe(p, e, UTF)) {
11864                     ended = 1;
11865                     break;
11866                 }
11867             } while (p < e);
11868             if (!ended)
11869                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11870         }
11871         else
11872             break;
11873     }
11874     return p;
11875 }
11876
11877 STATIC void
11878 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
11879 {
11880     /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
11881      * sets up the bitmap and any flags, removing those code points from the
11882      * inversion list, setting it to NULL should it become completely empty */
11883
11884     PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
11885     assert(PL_regkind[OP(node)] == ANYOF);
11886
11887     ANYOF_BITMAP_ZERO(node);
11888     if (*invlist_ptr) {
11889
11890         /* This gets set if we actually need to modify things */
11891         bool change_invlist = FALSE;
11892
11893         UV start, end;
11894
11895         /* Start looking through *invlist_ptr */
11896         invlist_iterinit(*invlist_ptr);
11897         while (invlist_iternext(*invlist_ptr, &start, &end)) {
11898             UV high;
11899             int i;
11900
11901             if (end == UV_MAX && start <= 256) {
11902                 ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL;
11903             }
11904
11905             /* Quit if are above what we should change */
11906             if (start > 255) {
11907                 break;
11908             }
11909
11910             change_invlist = TRUE;
11911
11912             /* Set all the bits in the range, up to the max that we are doing */
11913             high = (end < 255) ? end : 255;
11914             for (i = start; i <= (int) high; i++) {
11915                 if (! ANYOF_BITMAP_TEST(node, i)) {
11916                     ANYOF_BITMAP_SET(node, i);
11917                 }
11918             }
11919         }
11920         invlist_iterfinish(*invlist_ptr);
11921
11922         /* Done with loop; remove any code points that are in the bitmap from
11923          * *invlist_ptr; similarly for code points above latin1 if we have a flag
11924          * to match all of them anyways */
11925         if (change_invlist) {
11926             _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr);
11927         }
11928         if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
11929             _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr);
11930         }
11931
11932         /* If have completely emptied it, remove it completely */
11933         if (_invlist_len(*invlist_ptr) == 0) {
11934             SvREFCNT_dec_NN(*invlist_ptr);
11935             *invlist_ptr = NULL;
11936         }
11937     }
11938 }
11939
11940 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
11941    Character classes ([:foo:]) can also be negated ([:^foo:]).
11942    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
11943    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
11944    but trigger failures because they are currently unimplemented. */
11945
11946 #define POSIXCC_DONE(c)   ((c) == ':')
11947 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
11948 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
11949
11950 PERL_STATIC_INLINE I32
11951 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
11952 {
11953     dVAR;
11954     I32 namedclass = OOB_NAMEDCLASS;
11955
11956     PERL_ARGS_ASSERT_REGPPOSIXCC;
11957
11958     if (value == '[' && RExC_parse + 1 < RExC_end &&
11959         /* I smell either [: or [= or [. -- POSIX has been here, right? */
11960         POSIXCC(UCHARAT(RExC_parse)))
11961     {
11962         const char c = UCHARAT(RExC_parse);
11963         char* const s = RExC_parse++;
11964
11965         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
11966             RExC_parse++;
11967         if (RExC_parse == RExC_end) {
11968             if (strict) {
11969
11970                 /* Try to give a better location for the error (than the end of
11971                  * the string) by looking for the matching ']' */
11972                 RExC_parse = s;
11973                 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11974                     RExC_parse++;
11975                 }
11976                 vFAIL2("Unmatched '%c' in POSIX class", c);
11977             }
11978             /* Grandfather lone [:, [=, [. */
11979             RExC_parse = s;
11980         }
11981         else {
11982             const char* const t = RExC_parse++; /* skip over the c */
11983             assert(*t == c);
11984
11985             if (UCHARAT(RExC_parse) == ']') {
11986                 const char *posixcc = s + 1;
11987                 RExC_parse++; /* skip over the ending ] */
11988
11989                 if (*s == ':') {
11990                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
11991                     const I32 skip = t - posixcc;
11992
11993                     /* Initially switch on the length of the name.  */
11994                     switch (skip) {
11995                     case 4:
11996                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
11997                                                           this is the Perl \w
11998                                                         */
11999                             namedclass = ANYOF_WORDCHAR;
12000                         break;
12001                     case 5:
12002                         /* Names all of length 5.  */
12003                         /* alnum alpha ascii blank cntrl digit graph lower
12004                            print punct space upper  */
12005                         /* Offset 4 gives the best switch position.  */
12006                         switch (posixcc[4]) {
12007                         case 'a':
12008                             if (memEQ(posixcc, "alph", 4)) /* alpha */
12009                                 namedclass = ANYOF_ALPHA;
12010                             break;
12011                         case 'e':
12012                             if (memEQ(posixcc, "spac", 4)) /* space */
12013                                 namedclass = ANYOF_PSXSPC;
12014                             break;
12015                         case 'h':
12016                             if (memEQ(posixcc, "grap", 4)) /* graph */
12017                                 namedclass = ANYOF_GRAPH;
12018                             break;
12019                         case 'i':
12020                             if (memEQ(posixcc, "asci", 4)) /* ascii */
12021                                 namedclass = ANYOF_ASCII;
12022                             break;
12023                         case 'k':
12024                             if (memEQ(posixcc, "blan", 4)) /* blank */
12025                                 namedclass = ANYOF_BLANK;
12026                             break;
12027                         case 'l':
12028                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
12029                                 namedclass = ANYOF_CNTRL;
12030                             break;
12031                         case 'm':
12032                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
12033                                 namedclass = ANYOF_ALPHANUMERIC;
12034                             break;
12035                         case 'r':
12036                             if (memEQ(posixcc, "lowe", 4)) /* lower */
12037                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
12038                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
12039                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
12040                             break;
12041                         case 't':
12042                             if (memEQ(posixcc, "digi", 4)) /* digit */
12043                                 namedclass = ANYOF_DIGIT;
12044                             else if (memEQ(posixcc, "prin", 4)) /* print */
12045                                 namedclass = ANYOF_PRINT;
12046                             else if (memEQ(posixcc, "punc", 4)) /* punct */
12047                                 namedclass = ANYOF_PUNCT;
12048                             break;
12049                         }
12050                         break;
12051                     case 6:
12052                         if (memEQ(posixcc, "xdigit", 6))
12053                             namedclass = ANYOF_XDIGIT;
12054                         break;
12055                     }
12056
12057                     if (namedclass == OOB_NAMEDCLASS)
12058                         vFAIL2utf8f(
12059                             "POSIX class [:%"UTF8f":] unknown",
12060                             UTF8fARG(UTF, t - s - 1, s + 1));
12061
12062                     /* The #defines are structured so each complement is +1 to
12063                      * the normal one */
12064                     if (complement) {
12065                         namedclass++;
12066                     }
12067                     assert (posixcc[skip] == ':');
12068                     assert (posixcc[skip+1] == ']');
12069                 } else if (!SIZE_ONLY) {
12070                     /* [[=foo=]] and [[.foo.]] are still future. */
12071
12072                     /* adjust RExC_parse so the warning shows after
12073                        the class closes */
12074                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
12075                         RExC_parse++;
12076                     vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
12077                 }
12078             } else {
12079                 /* Maternal grandfather:
12080                  * "[:" ending in ":" but not in ":]" */
12081                 if (strict) {
12082                     vFAIL("Unmatched '[' in POSIX class");
12083                 }
12084
12085                 /* Grandfather lone [:, [=, [. */
12086                 RExC_parse = s;
12087             }
12088         }
12089     }
12090
12091     return namedclass;
12092 }
12093
12094 STATIC bool
12095 S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state)
12096 {
12097     /* This applies some heuristics at the current parse position (which should
12098      * be at a '[') to see if what follows might be intended to be a [:posix:]
12099      * class.  It returns true if it really is a posix class, of course, but it
12100      * also can return true if it thinks that what was intended was a posix
12101      * class that didn't quite make it.
12102      *
12103      * It will return true for
12104      *      [:alphanumerics:
12105      *      [:alphanumerics]  (as long as the ] isn't followed immediately by a
12106      *                         ')' indicating the end of the (?[
12107      *      [:any garbage including %^&$ punctuation:]
12108      *
12109      * This is designed to be called only from S_handle_regex_sets; it could be
12110      * easily adapted to be called from the spot at the beginning of regclass()
12111      * that checks to see in a normal bracketed class if the surrounding []
12112      * have been omitted ([:word:] instead of [[:word:]]).  But doing so would
12113      * change long-standing behavior, so I (khw) didn't do that */
12114     char* p = RExC_parse + 1;
12115     char first_char = *p;
12116
12117     PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
12118
12119     assert(*(p - 1) == '[');
12120
12121     if (! POSIXCC(first_char)) {
12122         return FALSE;
12123     }
12124
12125     p++;
12126     while (p < RExC_end && isWORDCHAR(*p)) p++;
12127
12128     if (p >= RExC_end) {
12129         return FALSE;
12130     }
12131
12132     if (p - RExC_parse > 2    /* Got at least 1 word character */
12133         && (*p == first_char
12134             || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
12135     {
12136         return TRUE;
12137     }
12138
12139     p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
12140
12141     return (p
12142             && p - RExC_parse > 2 /* [:] evaluates to colon;
12143                                       [::] is a bad posix class. */
12144             && first_char == *(p - 1));
12145 }
12146
12147 STATIC regnode *
12148 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *flagp, U32 depth,
12149                    char * const oregcomp_parse)
12150 {
12151     /* Handle the (?[...]) construct to do set operations */
12152
12153     U8 curchar;
12154     UV start, end;      /* End points of code point ranges */
12155     SV* result_string;
12156     char *save_end, *save_parse;
12157     SV* final;
12158     STRLEN len;
12159     regnode* node;
12160     AV* stack;
12161     const bool save_fold = FOLD;
12162
12163     GET_RE_DEBUG_FLAGS_DECL;
12164
12165     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
12166
12167     if (LOC) {
12168         vFAIL("(?[...]) not valid in locale");
12169     }
12170     RExC_uni_semantics = 1;
12171
12172     /* This will return only an ANYOF regnode, or (unlikely) something smaller
12173      * (such as EXACT).  Thus we can skip most everything if just sizing.  We
12174      * call regclass to handle '[]' so as to not have to reinvent its parsing
12175      * rules here (throwing away the size it computes each time).  And, we exit
12176      * upon an unescaped ']' that isn't one ending a regclass.  To do both
12177      * these things, we need to realize that something preceded by a backslash
12178      * is escaped, so we have to keep track of backslashes */
12179     if (SIZE_ONLY) {
12180         UV depth = 0; /* how many nested (?[...]) constructs */
12181
12182         Perl_ck_warner_d(aTHX_
12183             packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
12184             "The regex_sets feature is experimental" REPORT_LOCATION,
12185                 UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
12186                 UTF8fARG(UTF, RExC_end - RExC_start - (RExC_parse - RExC_precomp), RExC_precomp + (RExC_parse - RExC_precomp)));
12187
12188         while (RExC_parse < RExC_end) {
12189             SV* current = NULL;
12190             RExC_parse = regpatws(pRExC_state, RExC_parse,
12191                                 TRUE); /* means recognize comments */
12192             switch (*RExC_parse) {
12193                 case '?':
12194                     if (RExC_parse[1] == '[') depth++, RExC_parse++;
12195                     /* FALL THROUGH */
12196                 default:
12197                     break;
12198                 case '\\':
12199                     /* Skip the next byte (which could cause us to end up in
12200                      * the middle of a UTF-8 character, but since none of those
12201                      * are confusable with anything we currently handle in this
12202                      * switch (invariants all), it's safe.  We'll just hit the
12203                      * default: case next time and keep on incrementing until
12204                      * we find one of the invariants we do handle. */
12205                     RExC_parse++;
12206                     break;
12207                 case '[':
12208                 {
12209                     /* If this looks like it is a [:posix:] class, leave the
12210                      * parse pointer at the '[' to fool regclass() into
12211                      * thinking it is part of a '[[:posix:]]'.  That function
12212                      * will use strict checking to force a syntax error if it
12213                      * doesn't work out to a legitimate class */
12214                     bool is_posix_class
12215                                     = could_it_be_a_POSIX_class(pRExC_state);
12216                     if (! is_posix_class) {
12217                         RExC_parse++;
12218                     }
12219
12220                     /* regclass() can only return RESTART_UTF8 if multi-char
12221                        folds are allowed.  */
12222                     if (!regclass(pRExC_state, flagp,depth+1,
12223                                   is_posix_class, /* parse the whole char
12224                                                      class only if not a
12225                                                      posix class */
12226                                   FALSE, /* don't allow multi-char folds */
12227                                   TRUE, /* silence non-portable warnings. */
12228                                   &current))
12229                         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12230                               (UV) *flagp);
12231
12232                     /* function call leaves parse pointing to the ']', except
12233                      * if we faked it */
12234                     if (is_posix_class) {
12235                         RExC_parse--;
12236                     }
12237
12238                     SvREFCNT_dec(current);   /* In case it returned something */
12239                     break;
12240                 }
12241
12242                 case ']':
12243                     if (depth--) break;
12244                     RExC_parse++;
12245                     if (RExC_parse < RExC_end
12246                         && *RExC_parse == ')')
12247                     {
12248                         node = reganode(pRExC_state, ANYOF, 0);
12249                         RExC_size += ANYOF_SKIP;
12250                         nextchar(pRExC_state);
12251                         Set_Node_Length(node,
12252                                 RExC_parse - oregcomp_parse + 1); /* MJD */
12253                         return node;
12254                     }
12255                     goto no_close;
12256             }
12257             RExC_parse++;
12258         }
12259
12260         no_close:
12261         FAIL("Syntax error in (?[...])");
12262     }
12263
12264     /* Pass 2 only after this.  Everything in this construct is a
12265      * metacharacter.  Operands begin with either a '\' (for an escape
12266      * sequence), or a '[' for a bracketed character class.  Any other
12267      * character should be an operator, or parenthesis for grouping.  Both
12268      * types of operands are handled by calling regclass() to parse them.  It
12269      * is called with a parameter to indicate to return the computed inversion
12270      * list.  The parsing here is implemented via a stack.  Each entry on the
12271      * stack is a single character representing one of the operators, or the
12272      * '('; or else a pointer to an operand inversion list. */
12273
12274 #define IS_OPERAND(a)  (! SvIOK(a))
12275
12276     /* The stack starts empty.  It is a syntax error if the first thing parsed
12277      * is a binary operator; everything else is pushed on the stack.  When an
12278      * operand is parsed, the top of the stack is examined.  If it is a binary
12279      * operator, the item before it should be an operand, and both are replaced
12280      * by the result of doing that operation on the new operand and the one on
12281      * the stack.   Thus a sequence of binary operands is reduced to a single
12282      * one before the next one is parsed.
12283      *
12284      * A unary operator may immediately follow a binary in the input, for
12285      * example
12286      *      [a] + ! [b]
12287      * When an operand is parsed and the top of the stack is a unary operator,
12288      * the operation is performed, and then the stack is rechecked to see if
12289      * this new operand is part of a binary operation; if so, it is handled as
12290      * above.
12291      *
12292      * A '(' is simply pushed on the stack; it is valid only if the stack is
12293      * empty, or the top element of the stack is an operator or another '('
12294      * (for which the parenthesized expression will become an operand).  By the
12295      * time the corresponding ')' is parsed everything in between should have
12296      * been parsed and evaluated to a single operand (or else is a syntax
12297      * error), and is handled as a regular operand */
12298
12299     sv_2mortal((SV *)(stack = newAV()));
12300
12301     while (RExC_parse < RExC_end) {
12302         I32 top_index = av_tindex(stack);
12303         SV** top_ptr;
12304         SV* current = NULL;
12305
12306         /* Skip white space */
12307         RExC_parse = regpatws(pRExC_state, RExC_parse,
12308                                 TRUE); /* means recognize comments */
12309         if (RExC_parse >= RExC_end) {
12310             Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
12311         }
12312         if ((curchar = UCHARAT(RExC_parse)) == ']') {
12313             break;
12314         }
12315
12316         switch (curchar) {
12317
12318             case '?':
12319                 if (av_tindex(stack) >= 0   /* This makes sure that we can
12320                                                safely subtract 1 from
12321                                                RExC_parse in the next clause.
12322                                                If we have something on the
12323                                                stack, we have parsed something
12324                                              */
12325                     && UCHARAT(RExC_parse - 1) == '('
12326                     && RExC_parse < RExC_end)
12327                 {
12328                     /* If is a '(?', could be an embedded '(?flags:(?[...])'.
12329                      * This happens when we have some thing like
12330                      *
12331                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
12332                      *   ...
12333                      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
12334                      *
12335                      * Here we would be handling the interpolated
12336                      * '$thai_or_lao'.  We handle this by a recursive call to
12337                      * ourselves which returns the inversion list the
12338                      * interpolated expression evaluates to.  We use the flags
12339                      * from the interpolated pattern. */
12340                     U32 save_flags = RExC_flags;
12341                     const char * const save_parse = ++RExC_parse;
12342
12343                     parse_lparen_question_flags(pRExC_state);
12344
12345                     if (RExC_parse == save_parse  /* Makes sure there was at
12346                                                      least one flag (or this
12347                                                      embedding wasn't compiled)
12348                                                    */
12349                         || RExC_parse >= RExC_end - 4
12350                         || UCHARAT(RExC_parse) != ':'
12351                         || UCHARAT(++RExC_parse) != '('
12352                         || UCHARAT(++RExC_parse) != '?'
12353                         || UCHARAT(++RExC_parse) != '[')
12354                     {
12355
12356                         /* In combination with the above, this moves the
12357                          * pointer to the point just after the first erroneous
12358                          * character (or if there are no flags, to where they
12359                          * should have been) */
12360                         if (RExC_parse >= RExC_end - 4) {
12361                             RExC_parse = RExC_end;
12362                         }
12363                         else if (RExC_parse != save_parse) {
12364                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12365                         }
12366                         vFAIL("Expecting '(?flags:(?[...'");
12367                     }
12368                     RExC_parse++;
12369                     (void) handle_regex_sets(pRExC_state, &current, flagp,
12370                                                     depth+1, oregcomp_parse);
12371
12372                     /* Here, 'current' contains the embedded expression's
12373                      * inversion list, and RExC_parse points to the trailing
12374                      * ']'; the next character should be the ')' which will be
12375                      * paired with the '(' that has been put on the stack, so
12376                      * the whole embedded expression reduces to '(operand)' */
12377                     RExC_parse++;
12378
12379                     RExC_flags = save_flags;
12380                     goto handle_operand;
12381                 }
12382                 /* FALL THROUGH */
12383
12384             default:
12385                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12386                 vFAIL("Unexpected character");
12387
12388             case '\\':
12389                 /* regclass() can only return RESTART_UTF8 if multi-char
12390                    folds are allowed.  */
12391                 if (!regclass(pRExC_state, flagp,depth+1,
12392                               TRUE, /* means parse just the next thing */
12393                               FALSE, /* don't allow multi-char folds */
12394                               FALSE, /* don't silence non-portable warnings.  */
12395                               &current))
12396                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12397                           (UV) *flagp);
12398                 /* regclass() will return with parsing just the \ sequence,
12399                  * leaving the parse pointer at the next thing to parse */
12400                 RExC_parse--;
12401                 goto handle_operand;
12402
12403             case '[':   /* Is a bracketed character class */
12404             {
12405                 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
12406
12407                 if (! is_posix_class) {
12408                     RExC_parse++;
12409                 }
12410
12411                 /* regclass() can only return RESTART_UTF8 if multi-char
12412                    folds are allowed.  */
12413                 if(!regclass(pRExC_state, flagp,depth+1,
12414                              is_posix_class, /* parse the whole char class
12415                                                 only if not a posix class */
12416                              FALSE, /* don't allow multi-char folds */
12417                              FALSE, /* don't silence non-portable warnings.  */
12418                              &current))
12419                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12420                           (UV) *flagp);
12421                 /* function call leaves parse pointing to the ']', except if we
12422                  * faked it */
12423                 if (is_posix_class) {
12424                     RExC_parse--;
12425                 }
12426
12427                 goto handle_operand;
12428             }
12429
12430             case '&':
12431             case '|':
12432             case '+':
12433             case '-':
12434             case '^':
12435                 if (top_index < 0
12436                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
12437                     || ! IS_OPERAND(*top_ptr))
12438                 {
12439                     RExC_parse++;
12440                     vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
12441                 }
12442                 av_push(stack, newSVuv(curchar));
12443                 break;
12444
12445             case '!':
12446                 av_push(stack, newSVuv(curchar));
12447                 break;
12448
12449             case '(':
12450                 if (top_index >= 0) {
12451                     top_ptr = av_fetch(stack, top_index, FALSE);
12452                     assert(top_ptr);
12453                     if (IS_OPERAND(*top_ptr)) {
12454                         RExC_parse++;
12455                         vFAIL("Unexpected '(' with no preceding operator");
12456                     }
12457                 }
12458                 av_push(stack, newSVuv(curchar));
12459                 break;
12460
12461             case ')':
12462             {
12463                 SV* lparen;
12464                 if (top_index < 1
12465                     || ! (current = av_pop(stack))
12466                     || ! IS_OPERAND(current)
12467                     || ! (lparen = av_pop(stack))
12468                     || IS_OPERAND(lparen)
12469                     || SvUV(lparen) != '(')
12470                 {
12471                     SvREFCNT_dec(current);
12472                     RExC_parse++;
12473                     vFAIL("Unexpected ')'");
12474                 }
12475                 top_index -= 2;
12476                 SvREFCNT_dec_NN(lparen);
12477
12478                 /* FALL THROUGH */
12479             }
12480
12481               handle_operand:
12482
12483                 /* Here, we have an operand to process, in 'current' */
12484
12485                 if (top_index < 0) {    /* Just push if stack is empty */
12486                     av_push(stack, current);
12487                 }
12488                 else {
12489                     SV* top = av_pop(stack);
12490                     SV *prev = NULL;
12491                     char current_operator;
12492
12493                     if (IS_OPERAND(top)) {
12494                         SvREFCNT_dec_NN(top);
12495                         SvREFCNT_dec_NN(current);
12496                         vFAIL("Operand with no preceding operator");
12497                     }
12498                     current_operator = (char) SvUV(top);
12499                     switch (current_operator) {
12500                         case '(':   /* Push the '(' back on followed by the new
12501                                        operand */
12502                             av_push(stack, top);
12503                             av_push(stack, current);
12504                             SvREFCNT_inc(top);  /* Counters the '_dec' done
12505                                                    just after the 'break', so
12506                                                    it doesn't get wrongly freed
12507                                                  */
12508                             break;
12509
12510                         case '!':
12511                             _invlist_invert(current);
12512
12513                             /* Unlike binary operators, the top of the stack,
12514                              * now that this unary one has been popped off, may
12515                              * legally be an operator, and we now have operand
12516                              * for it. */
12517                             top_index--;
12518                             SvREFCNT_dec_NN(top);
12519                             goto handle_operand;
12520
12521                         case '&':
12522                             prev = av_pop(stack);
12523                             _invlist_intersection(prev,
12524                                                    current,
12525                                                    &current);
12526                             av_push(stack, current);
12527                             break;
12528
12529                         case '|':
12530                         case '+':
12531                             prev = av_pop(stack);
12532                             _invlist_union(prev, current, &current);
12533                             av_push(stack, current);
12534                             break;
12535
12536                         case '-':
12537                             prev = av_pop(stack);;
12538                             _invlist_subtract(prev, current, &current);
12539                             av_push(stack, current);
12540                             break;
12541
12542                         case '^':   /* The union minus the intersection */
12543                         {
12544                             SV* i = NULL;
12545                             SV* u = NULL;
12546                             SV* element;
12547
12548                             prev = av_pop(stack);
12549                             _invlist_union(prev, current, &u);
12550                             _invlist_intersection(prev, current, &i);
12551                             /* _invlist_subtract will overwrite current
12552                                 without freeing what it already contains */
12553                             element = current;
12554                             _invlist_subtract(u, i, &current);
12555                             av_push(stack, current);
12556                             SvREFCNT_dec_NN(i);
12557                             SvREFCNT_dec_NN(u);
12558                             SvREFCNT_dec_NN(element);
12559                             break;
12560                         }
12561
12562                         default:
12563                             Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
12564                 }
12565                 SvREFCNT_dec_NN(top);
12566                 SvREFCNT_dec(prev);
12567             }
12568         }
12569
12570         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12571     }
12572
12573     if (av_tindex(stack) < 0   /* Was empty */
12574         || ((final = av_pop(stack)) == NULL)
12575         || ! IS_OPERAND(final)
12576         || av_tindex(stack) >= 0)  /* More left on stack */
12577     {
12578         vFAIL("Incomplete expression within '(?[ ])'");
12579     }
12580
12581     /* Here, 'final' is the resultant inversion list from evaluating the
12582      * expression.  Return it if so requested */
12583     if (return_invlist) {
12584         *return_invlist = final;
12585         return END;
12586     }
12587
12588     /* Otherwise generate a resultant node, based on 'final'.  regclass() is
12589      * expecting a string of ranges and individual code points */
12590     invlist_iterinit(final);
12591     result_string = newSVpvs("");
12592     while (invlist_iternext(final, &start, &end)) {
12593         if (start == end) {
12594             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
12595         }
12596         else {
12597             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
12598                                                      start,          end);
12599         }
12600     }
12601
12602     save_parse = RExC_parse;
12603     RExC_parse = SvPV(result_string, len);
12604     save_end = RExC_end;
12605     RExC_end = RExC_parse + len;
12606
12607     /* We turn off folding around the call, as the class we have constructed
12608      * already has all folding taken into consideration, and we don't want
12609      * regclass() to add to that */
12610     RExC_flags &= ~RXf_PMf_FOLD;
12611     /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
12612      */
12613     node = regclass(pRExC_state, flagp,depth+1,
12614                     FALSE, /* means parse the whole char class */
12615                     FALSE, /* don't allow multi-char folds */
12616                     TRUE, /* silence non-portable warnings.  The above may very
12617                              well have generated non-portable code points, but
12618                              they're valid on this machine */
12619                     NULL);
12620     if (!node)
12621         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
12622                     PTR2UV(flagp));
12623     if (save_fold) {
12624         RExC_flags |= RXf_PMf_FOLD;
12625     }
12626     RExC_parse = save_parse + 1;
12627     RExC_end = save_end;
12628     SvREFCNT_dec_NN(final);
12629     SvREFCNT_dec_NN(result_string);
12630
12631     nextchar(pRExC_state);
12632     Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
12633     return node;
12634 }
12635 #undef IS_OPERAND
12636
12637 /* The names of properties whose definitions are not known at compile time are
12638  * stored in this SV, after a constant heading.  So if the length has been
12639  * changed since initialization, then there is a run-time definition. */
12640 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
12641
12642 STATIC regnode *
12643 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
12644                  const bool stop_at_1,  /* Just parse the next thing, don't
12645                                            look for a full character class */
12646                  bool allow_multi_folds,
12647                  const bool silence_non_portable,   /* Don't output warnings
12648                                                        about too large
12649                                                        characters */
12650                  SV** ret_invlist)  /* Return an inversion list, not a node */
12651 {
12652     /* parse a bracketed class specification.  Most of these will produce an
12653      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
12654      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
12655      * under /i with multi-character folds: it will be rewritten following the
12656      * paradigm of this example, where the <multi-fold>s are characters which
12657      * fold to multiple character sequences:
12658      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
12659      * gets effectively rewritten as:
12660      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
12661      * reg() gets called (recursively) on the rewritten version, and this
12662      * function will return what it constructs.  (Actually the <multi-fold>s
12663      * aren't physically removed from the [abcdefghi], it's just that they are
12664      * ignored in the recursion by means of a flag:
12665      * <RExC_in_multi_char_class>.)
12666      *
12667      * ANYOF nodes contain a bit map for the first 256 characters, with the
12668      * corresponding bit set if that character is in the list.  For characters
12669      * above 255, a range list or swash is used.  There are extra bits for \w,
12670      * etc. in locale ANYOFs, as what these match is not determinable at
12671      * compile time
12672      *
12673      * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
12674      * to be restarted.  This can only happen if ret_invlist is non-NULL.
12675      */
12676
12677     dVAR;
12678     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
12679     IV range = 0;
12680     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
12681     regnode *ret;
12682     STRLEN numlen;
12683     IV namedclass = OOB_NAMEDCLASS;
12684     char *rangebegin = NULL;
12685     bool need_class = 0;
12686     SV *listsv = NULL;
12687     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
12688                                       than just initialized.  */
12689     SV* properties = NULL;    /* Code points that match \p{} \P{} */
12690     SV* posixes = NULL;     /* Code points that match classes like, [:word:],
12691                                extended beyond the Latin1 range */
12692     UV element_count = 0;   /* Number of distinct elements in the class.
12693                                Optimizations may be possible if this is tiny */
12694     AV * multi_char_matches = NULL; /* Code points that fold to more than one
12695                                        character; used under /i */
12696     UV n;
12697     char * stop_ptr = RExC_end;    /* where to stop parsing */
12698     const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
12699                                                    space? */
12700     const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
12701
12702     /* Unicode properties are stored in a swash; this holds the current one
12703      * being parsed.  If this swash is the only above-latin1 component of the
12704      * character class, an optimization is to pass it directly on to the
12705      * execution engine.  Otherwise, it is set to NULL to indicate that there
12706      * are other things in the class that have to be dealt with at execution
12707      * time */
12708     SV* swash = NULL;           /* Code points that match \p{} \P{} */
12709
12710     /* Set if a component of this character class is user-defined; just passed
12711      * on to the engine */
12712     bool has_user_defined_property = FALSE;
12713
12714     /* inversion list of code points this node matches only when the target
12715      * string is in UTF-8.  (Because is under /d) */
12716     SV* depends_list = NULL;
12717
12718     /* inversion list of code points this node matches.  For much of the
12719      * function, it includes only those that match regardless of the utf8ness
12720      * of the target string */
12721     SV* cp_list = NULL;
12722
12723 #ifdef EBCDIC
12724     /* In a range, counts how many 0-2 of the ends of it came from literals,
12725      * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
12726     UV literal_endpoint = 0;
12727 #endif
12728     bool invert = FALSE;    /* Is this class to be complemented */
12729
12730     /* Is there any thing like \W or [:^digit:] that matches above the legal
12731      * Unicode range? */
12732     bool runtime_posix_matches_above_Unicode = FALSE;
12733
12734     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
12735         case we need to change the emitted regop to an EXACT. */
12736     const char * orig_parse = RExC_parse;
12737     const SSize_t orig_size = RExC_size;
12738     bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
12739     GET_RE_DEBUG_FLAGS_DECL;
12740
12741     PERL_ARGS_ASSERT_REGCLASS;
12742 #ifndef DEBUGGING
12743     PERL_UNUSED_ARG(depth);
12744 #endif
12745
12746     DEBUG_PARSE("clas");
12747
12748     /* Assume we are going to generate an ANYOF node. */
12749     ret = reganode(pRExC_state, ANYOF, 0);
12750
12751     if (SIZE_ONLY) {
12752         RExC_size += ANYOF_SKIP;
12753         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
12754     }
12755     else {
12756         ANYOF_FLAGS(ret) = 0;
12757
12758         RExC_emit += ANYOF_SKIP;
12759         if (LOC) {
12760             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
12761         }
12762         listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
12763         initial_listsv_len = SvCUR(listsv);
12764         SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
12765     }
12766
12767     if (skip_white) {
12768         RExC_parse = regpatws(pRExC_state, RExC_parse,
12769                               FALSE /* means don't recognize comments */);
12770     }
12771
12772     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
12773         RExC_parse++;
12774         invert = TRUE;
12775         allow_multi_folds = FALSE;
12776         RExC_naughty++;
12777         if (skip_white) {
12778             RExC_parse = regpatws(pRExC_state, RExC_parse,
12779                                   FALSE /* means don't recognize comments */);
12780         }
12781     }
12782
12783     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
12784     if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
12785         const char *s = RExC_parse;
12786         const char  c = *s++;
12787
12788         while (isWORDCHAR(*s))
12789             s++;
12790         if (*s && c == *s && s[1] == ']') {
12791             SAVEFREESV(RExC_rx_sv);
12792             ckWARN3reg(s+2,
12793                        "POSIX syntax [%c %c] belongs inside character classes",
12794                        c, c);
12795             (void)ReREFCNT_inc(RExC_rx_sv);
12796         }
12797     }
12798
12799     /* If the caller wants us to just parse a single element, accomplish this
12800      * by faking the loop ending condition */
12801     if (stop_at_1 && RExC_end > RExC_parse) {
12802         stop_ptr = RExC_parse + 1;
12803     }
12804
12805     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
12806     if (UCHARAT(RExC_parse) == ']')
12807         goto charclassloop;
12808
12809 parseit:
12810     while (1) {
12811         if  (RExC_parse >= stop_ptr) {
12812             break;
12813         }
12814
12815         if (skip_white) {
12816             RExC_parse = regpatws(pRExC_state, RExC_parse,
12817                                   FALSE /* means don't recognize comments */);
12818         }
12819
12820         if  (UCHARAT(RExC_parse) == ']') {
12821             break;
12822         }
12823
12824     charclassloop:
12825
12826         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
12827         save_value = value;
12828         save_prevvalue = prevvalue;
12829
12830         if (!range) {
12831             rangebegin = RExC_parse;
12832             element_count++;
12833         }
12834         if (UTF) {
12835             value = utf8n_to_uvchr((U8*)RExC_parse,
12836                                    RExC_end - RExC_parse,
12837                                    &numlen, UTF8_ALLOW_DEFAULT);
12838             RExC_parse += numlen;
12839         }
12840         else
12841             value = UCHARAT(RExC_parse++);
12842
12843         if (value == '['
12844             && RExC_parse < RExC_end
12845             && POSIXCC(UCHARAT(RExC_parse)))
12846         {
12847             namedclass = regpposixcc(pRExC_state, value, strict);
12848         }
12849         else if (value == '\\') {
12850             if (UTF) {
12851                 value = utf8n_to_uvchr((U8*)RExC_parse,
12852                                    RExC_end - RExC_parse,
12853                                    &numlen, UTF8_ALLOW_DEFAULT);
12854                 RExC_parse += numlen;
12855             }
12856             else
12857                 value = UCHARAT(RExC_parse++);
12858
12859             /* Some compilers cannot handle switching on 64-bit integer
12860              * values, therefore value cannot be an UV.  Yes, this will
12861              * be a problem later if we want switch on Unicode.
12862              * A similar issue a little bit later when switching on
12863              * namedclass. --jhi */
12864
12865             /* If the \ is escaping white space when white space is being
12866              * skipped, it means that that white space is wanted literally, and
12867              * is already in 'value'.  Otherwise, need to translate the escape
12868              * into what it signifies. */
12869             if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
12870
12871             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
12872             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
12873             case 's':   namedclass = ANYOF_SPACE;       break;
12874             case 'S':   namedclass = ANYOF_NSPACE;      break;
12875             case 'd':   namedclass = ANYOF_DIGIT;       break;
12876             case 'D':   namedclass = ANYOF_NDIGIT;      break;
12877             case 'v':   namedclass = ANYOF_VERTWS;      break;
12878             case 'V':   namedclass = ANYOF_NVERTWS;     break;
12879             case 'h':   namedclass = ANYOF_HORIZWS;     break;
12880             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
12881             case 'N':  /* Handle \N{NAME} in class */
12882                 {
12883                     /* We only pay attention to the first char of
12884                     multichar strings being returned. I kinda wonder
12885                     if this makes sense as it does change the behaviour
12886                     from earlier versions, OTOH that behaviour was broken
12887                     as well. */
12888                     if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
12889                                       TRUE, /* => charclass */
12890                                       strict))
12891                     {
12892                         if (*flagp & RESTART_UTF8)
12893                             FAIL("panic: grok_bslash_N set RESTART_UTF8");
12894                         goto parseit;
12895                     }
12896                 }
12897                 break;
12898             case 'p':
12899             case 'P':
12900                 {
12901                 char *e;
12902
12903                 /* We will handle any undefined properties ourselves */
12904                 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF;
12905
12906                 if (RExC_parse >= RExC_end)
12907                     vFAIL2("Empty \\%c{}", (U8)value);
12908                 if (*RExC_parse == '{') {
12909                     const U8 c = (U8)value;
12910                     e = strchr(RExC_parse++, '}');
12911                     if (!e)
12912                         vFAIL2("Missing right brace on \\%c{}", c);
12913                     while (isSPACE(UCHARAT(RExC_parse)))
12914                         RExC_parse++;
12915                     if (e == RExC_parse)
12916                         vFAIL2("Empty \\%c{}", c);
12917                     n = e - RExC_parse;
12918                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
12919                         n--;
12920                 }
12921                 else {
12922                     e = RExC_parse;
12923                     n = 1;
12924                 }
12925                 if (!SIZE_ONLY) {
12926                     SV* invlist;
12927                     char* formatted;
12928                     char* name;
12929
12930                     if (UCHARAT(RExC_parse) == '^') {
12931                          RExC_parse++;
12932                          n--;
12933                          /* toggle.  (The rhs xor gets the single bit that
12934                           * differs between P and p; the other xor inverts just
12935                           * that bit) */
12936                          value ^= 'P' ^ 'p';
12937
12938                          while (isSPACE(UCHARAT(RExC_parse))) {
12939                               RExC_parse++;
12940                               n--;
12941                          }
12942                     }
12943                     /* Try to get the definition of the property into
12944                      * <invlist>.  If /i is in effect, the effective property
12945                      * will have its name be <__NAME_i>.  The design is
12946                      * discussed in commit
12947                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
12948                     formatted = Perl_form(aTHX_
12949                                           "%s%.*s%s\n",
12950                                           (FOLD) ? "__" : "",
12951                                           (int)n,
12952                                           RExC_parse,
12953                                           (FOLD) ? "_i" : ""
12954                                 );
12955                     name = savepvn(formatted, strlen(formatted));
12956
12957                     /* Look up the property name, and get its swash and
12958                      * inversion list, if the property is found  */
12959                     if (swash) {
12960                         SvREFCNT_dec_NN(swash);
12961                     }
12962                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
12963                                              1, /* binary */
12964                                              0, /* not tr/// */
12965                                              NULL, /* No inversion list */
12966                                              &swash_init_flags
12967                                             );
12968                     if (! swash || ! (invlist = _get_swash_invlist(swash))) {
12969                         if (swash) {
12970                             SvREFCNT_dec_NN(swash);
12971                             swash = NULL;
12972                         }
12973
12974                         /* Here didn't find it.  It could be a user-defined
12975                          * property that will be available at run-time.  If we
12976                          * accept only compile-time properties, is an error;
12977                          * otherwise add it to the list for run-time look up */
12978                         if (ret_invlist) {
12979                             RExC_parse = e + 1;
12980                             vFAIL2utf8f(
12981                                 "Property '%"UTF8f"' is unknown",
12982                                 UTF8fARG(UTF, n, name));
12983                         }
12984                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
12985                                         (value == 'p' ? '+' : '!'),
12986                                         UTF8fARG(UTF, n, name));
12987                         has_user_defined_property = TRUE;
12988
12989                         /* We don't know yet, so have to assume that the
12990                          * property could match something in the Latin1 range,
12991                          * hence something that isn't utf8.  Note that this
12992                          * would cause things in <depends_list> to match
12993                          * inappropriately, except that any \p{}, including
12994                          * this one forces Unicode semantics, which means there
12995                          * is <no depends_list> */
12996                         ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
12997                     }
12998                     else {
12999
13000                         /* Here, did get the swash and its inversion list.  If
13001                          * the swash is from a user-defined property, then this
13002                          * whole character class should be regarded as such */
13003                         has_user_defined_property =
13004                                     (swash_init_flags
13005                                      & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
13006
13007                         /* Invert if asking for the complement */
13008                         if (value == 'P') {
13009                             _invlist_union_complement_2nd(properties,
13010                                                           invlist,
13011                                                           &properties);
13012
13013                             /* The swash can't be used as-is, because we've
13014                              * inverted things; delay removing it to here after
13015                              * have copied its invlist above */
13016                             SvREFCNT_dec_NN(swash);
13017                             swash = NULL;
13018                         }
13019                         else {
13020                             _invlist_union(properties, invlist, &properties);
13021                         }
13022                     }
13023                     Safefree(name);
13024                 }
13025                 RExC_parse = e + 1;
13026                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
13027                                                 named */
13028
13029                 /* \p means they want Unicode semantics */
13030                 RExC_uni_semantics = 1;
13031                 }
13032                 break;
13033             case 'n':   value = '\n';                   break;
13034             case 'r':   value = '\r';                   break;
13035             case 't':   value = '\t';                   break;
13036             case 'f':   value = '\f';                   break;
13037             case 'b':   value = '\b';                   break;
13038             case 'e':   value = ASCII_TO_NATIVE('\033');break;
13039             case 'a':   value = '\a';                   break;
13040             case 'o':
13041                 RExC_parse--;   /* function expects to be pointed at the 'o' */
13042                 {
13043                     const char* error_msg;
13044                     bool valid = grok_bslash_o(&RExC_parse,
13045                                                &value,
13046                                                &error_msg,
13047                                                SIZE_ONLY,   /* warnings in pass
13048                                                                1 only */
13049                                                strict,
13050                                                silence_non_portable,
13051                                                UTF);
13052                     if (! valid) {
13053                         vFAIL(error_msg);
13054                     }
13055                 }
13056                 if (PL_encoding && value < 0x100) {
13057                     goto recode_encoding;
13058                 }
13059                 break;
13060             case 'x':
13061                 RExC_parse--;   /* function expects to be pointed at the 'x' */
13062                 {
13063                     const char* error_msg;
13064                     bool valid = grok_bslash_x(&RExC_parse,
13065                                                &value,
13066                                                &error_msg,
13067                                                TRUE, /* Output warnings */
13068                                                strict,
13069                                                silence_non_portable,
13070                                                UTF);
13071                     if (! valid) {
13072                         vFAIL(error_msg);
13073                     }
13074                 }
13075                 if (PL_encoding && value < 0x100)
13076                     goto recode_encoding;
13077                 break;
13078             case 'c':
13079                 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
13080                 break;
13081             case '0': case '1': case '2': case '3': case '4':
13082             case '5': case '6': case '7':
13083                 {
13084                     /* Take 1-3 octal digits */
13085                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
13086                     numlen = (strict) ? 4 : 3;
13087                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
13088                     RExC_parse += numlen;
13089                     if (numlen != 3) {
13090                         if (strict) {
13091                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13092                             vFAIL("Need exactly 3 octal digits");
13093                         }
13094                         else if (! SIZE_ONLY /* like \08, \178 */
13095                                  && numlen < 3
13096                                  && RExC_parse < RExC_end
13097                                  && isDIGIT(*RExC_parse)
13098                                  && ckWARN(WARN_REGEXP))
13099                         {
13100                             SAVEFREESV(RExC_rx_sv);
13101                             reg_warn_non_literal_string(
13102                                  RExC_parse + 1,
13103                                  form_short_octal_warning(RExC_parse, numlen));
13104                             (void)ReREFCNT_inc(RExC_rx_sv);
13105                         }
13106                     }
13107                     if (PL_encoding && value < 0x100)
13108                         goto recode_encoding;
13109                     break;
13110                 }
13111             recode_encoding:
13112                 if (! RExC_override_recoding) {
13113                     SV* enc = PL_encoding;
13114                     value = reg_recode((const char)(U8)value, &enc);
13115                     if (!enc) {
13116                         if (strict) {
13117                             vFAIL("Invalid escape in the specified encoding");
13118                         }
13119                         else if (SIZE_ONLY) {
13120                             ckWARNreg(RExC_parse,
13121                                   "Invalid escape in the specified encoding");
13122                         }
13123                     }
13124                     break;
13125                 }
13126             default:
13127                 /* Allow \_ to not give an error */
13128                 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
13129                     if (strict) {
13130                         vFAIL2("Unrecognized escape \\%c in character class",
13131                                (int)value);
13132                     }
13133                     else {
13134                         SAVEFREESV(RExC_rx_sv);
13135                         ckWARN2reg(RExC_parse,
13136                             "Unrecognized escape \\%c in character class passed through",
13137                             (int)value);
13138                         (void)ReREFCNT_inc(RExC_rx_sv);
13139                     }
13140                 }
13141                 break;
13142             }   /* End of switch on char following backslash */
13143         } /* end of handling backslash escape sequences */
13144 #ifdef EBCDIC
13145         else
13146             literal_endpoint++;
13147 #endif
13148
13149         /* Here, we have the current token in 'value' */
13150
13151         /* What matches in a locale is not known until runtime.  This includes
13152          * what the Posix classes (like \w, [:space:]) match.  Room must be
13153          * reserved (one time per outer bracketed class) to store such classes,
13154          * either if Perl is compiled so that locale nodes always should have
13155          * this space, or if there is such posix class info to be stored.  The
13156          * space will contain a bit for each named class that is to be matched
13157          * against.  This isn't needed for \p{} and pseudo-classes, as they are
13158          * not affected by locale, and hence are dealt with separately */
13159         if (LOC
13160             && ! need_class
13161             && (ANYOF_LOCALE == ANYOF_POSIXL
13162                 || (namedclass > OOB_NAMEDCLASS
13163                     && namedclass < ANYOF_POSIXL_MAX)))
13164         {
13165             need_class = 1;
13166             if (SIZE_ONLY) {
13167                 RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13168             }
13169             else {
13170                 RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13171             }
13172             ANYOF_POSIXL_ZERO(ret);
13173             ANYOF_FLAGS(ret) |= ANYOF_POSIXL;
13174         }
13175
13176         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
13177             U8 classnum;
13178
13179             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
13180              * literal, as is the character that began the false range, i.e.
13181              * the 'a' in the examples */
13182             if (range) {
13183                 if (!SIZE_ONLY) {
13184                     const int w = (RExC_parse >= rangebegin)
13185                                   ? RExC_parse - rangebegin
13186                                   : 0;
13187                     if (strict) {
13188                         vFAIL2utf8f(
13189                             "False [] range \"%"UTF8f"\"",
13190                             UTF8fARG(UTF, w, rangebegin));
13191                     }
13192                     else {
13193                         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
13194                         ckWARN2reg(RExC_parse,
13195                             "False [] range \"%"UTF8f"\"",
13196                             UTF8fARG(UTF, w, rangebegin));
13197                         (void)ReREFCNT_inc(RExC_rx_sv);
13198                         cp_list = add_cp_to_invlist(cp_list, '-');
13199                         cp_list = add_cp_to_invlist(cp_list, prevvalue);
13200                     }
13201                 }
13202
13203                 range = 0; /* this was not a true range */
13204                 element_count += 2; /* So counts for three values */
13205             }
13206
13207             classnum = namedclass_to_classnum(namedclass);
13208
13209             if (LOC && namedclass < ANYOF_POSIXL_MAX
13210 #ifndef HAS_ISASCII
13211                 && classnum != _CC_ASCII
13212 #endif
13213 #ifndef HAS_ISBLANK
13214                 && classnum != _CC_BLANK
13215 #endif
13216             ) {
13217                 if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL)
13218                     && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
13219                                                             ? -1
13220                                                             : 1)))
13221                 {
13222                     posixl_matches_all = TRUE;
13223                     break;
13224                 }
13225                 ANYOF_POSIXL_SET(ret, namedclass);
13226             }
13227             /* XXX After have made all the posix classes known at compile time
13228              * we can move the LOC handling below to above */
13229
13230             if (! SIZE_ONLY) {
13231                 if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
13232                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
13233
13234                         /* Here, should be \h, \H, \v, or \V.  Neither /d nor
13235                          * /l make a difference in what these match.  There
13236                          * would be problems if these characters had folds
13237                          * other than themselves, as cp_list is subject to
13238                          * folding. */
13239                         if (classnum != _CC_VERTSPACE) {
13240                             assert(   namedclass == ANYOF_HORIZWS
13241                                    || namedclass == ANYOF_NHORIZWS);
13242
13243                             /* It turns out that \h is just a synonym for
13244                              * XPosixBlank */
13245                             classnum = _CC_BLANK;
13246                         }
13247
13248                         _invlist_union_maybe_complement_2nd(
13249                                 cp_list,
13250                                 PL_XPosix_ptrs[classnum],
13251                                 cBOOL(namedclass % 2), /* Complement if odd
13252                                                           (NHORIZWS, NVERTWS)
13253                                                         */
13254                                 &cp_list);
13255                     }
13256                 }
13257                 else if (classnum == _CC_ASCII) {
13258 #ifdef HAS_ISASCII
13259                     if (LOC) {
13260                         ANYOF_POSIXL_SET(ret, namedclass);
13261                     }
13262                     else
13263 #endif  /* Not isascii(); just use the hard-coded definition for it */
13264                         _invlist_union_maybe_complement_2nd(
13265                                 posixes,
13266                                 PL_Posix_ptrs[_CC_ASCII],
13267                                 cBOOL(namedclass % 2), /* Complement if odd
13268                                                           (NASCII) */
13269                                 &posixes);
13270                 }
13271                 else {  /* Garden variety class */
13272
13273                     /* The ascii range inversion list */
13274                     SV* ascii_source = PL_Posix_ptrs[classnum];
13275
13276                     /* The full Latin1 range inversion list */
13277                     SV* l1_source = PL_L1Posix_ptrs[classnum];
13278
13279                     /* This code is structured into two major clauses.  The
13280                      * first is for classes whose complete definitions may not
13281                      * already be known.  If not, the Latin1 definition
13282                      * (guaranteed to already known) is used plus code is
13283                      * generated to load the rest at run-time (only if needed).
13284                      * If the complete definition is known, it drops down to
13285                      * the second clause, where the complete definition is
13286                      * known */
13287
13288                     if (classnum < _FIRST_NON_SWASH_CC) {
13289
13290                         /* Here, the class has a swash, which may or not
13291                          * already be loaded */
13292
13293                         /* The name of the property to use to match the full
13294                          * eXtended Unicode range swash for this character
13295                          * class */
13296                         const char *Xname = swash_property_names[classnum];
13297
13298                         /* If returning the inversion list, we can't defer
13299                          * getting this until runtime */
13300                         if (ret_invlist && !  PL_utf8_swash_ptrs[classnum]) {
13301                             PL_utf8_swash_ptrs[classnum] =
13302                                 _core_swash_init("utf8", Xname, &PL_sv_undef,
13303                                              1, /* binary */
13304                                              0, /* not tr/// */
13305                                              NULL, /* No inversion list */
13306                                              NULL  /* No flags */
13307                                             );
13308                             assert(PL_utf8_swash_ptrs[classnum]);
13309                         }
13310                         if ( !  PL_utf8_swash_ptrs[classnum]) {
13311                             if (namedclass % 2 == 0) { /* A non-complemented
13312                                                           class */
13313                                 /* If not /a matching, there are code points we
13314                                  * don't know at compile time.  Arrange for the
13315                                  * unknown matches to be loaded at run-time, if
13316                                  * needed */
13317                                 if (! AT_LEAST_ASCII_RESTRICTED) {
13318                                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::%s\n",
13319                                                                  Xname);
13320                                 }
13321                                 if (LOC) {  /* Under locale, set run-time
13322                                                lookup */
13323                                     ANYOF_POSIXL_SET(ret, namedclass);
13324                                 }
13325                                 else {
13326                                     /* Add the current class's code points to
13327                                      * the running total */
13328                                     _invlist_union(posixes,
13329                                                    (AT_LEAST_ASCII_RESTRICTED)
13330                                                         ? ascii_source
13331                                                         : l1_source,
13332                                                    &posixes);
13333                                 }
13334                             }
13335                             else {  /* A complemented class */
13336                                 if (AT_LEAST_ASCII_RESTRICTED) {
13337                                     /* Under /a should match everything above
13338                                      * ASCII, plus the complement of the set's
13339                                      * ASCII matches */
13340                                     _invlist_union_complement_2nd(posixes,
13341                                                                   ascii_source,
13342                                                                   &posixes);
13343                                 }
13344                                 else {
13345                                     /* Arrange for the unknown matches to be
13346                                      * loaded at run-time, if needed */
13347                                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::%s\n",
13348                                                                  Xname);
13349                                     runtime_posix_matches_above_Unicode = TRUE;
13350                                     if (LOC) {
13351                                         ANYOF_POSIXL_SET(ret, namedclass);
13352                                     }
13353                                     else {
13354
13355                                         /* We want to match everything in
13356                                          * Latin1, except those things that
13357                                          * l1_source matches */
13358                                         SV* scratch_list = NULL;
13359                                         _invlist_subtract(PL_Latin1, l1_source,
13360                                                           &scratch_list);
13361
13362                                         /* Add the list from this class to the
13363                                          * running total */
13364                                         if (! posixes) {
13365                                             posixes = scratch_list;
13366                                         }
13367                                         else {
13368                                             _invlist_union(posixes,
13369                                                            scratch_list,
13370                                                            &posixes);
13371                                             SvREFCNT_dec_NN(scratch_list);
13372                                         }
13373                                         if (DEPENDS_SEMANTICS) {
13374                                             ANYOF_FLAGS(ret)
13375                                                   |= ANYOF_NON_UTF8_LATIN1_ALL;
13376                                         }
13377                                     }
13378                                 }
13379                             }
13380                             goto namedclass_done;
13381                         }
13382
13383                         /* Here, there is a swash loaded for the class.  If no
13384                          * inversion list for it yet, get it */
13385                         if (! PL_XPosix_ptrs[classnum]) {
13386                             PL_XPosix_ptrs[classnum]
13387                              = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]);
13388                         }
13389                     }
13390
13391                     /* Here there is an inversion list already loaded for the
13392                      * entire class */
13393
13394                     if (namedclass % 2 == 0) {  /* A non-complemented class,
13395                                                    like ANYOF_PUNCT */
13396                         if (! LOC) {
13397                             /* For non-locale, just add it to any existing list
13398                              * */
13399                             _invlist_union(posixes,
13400                                            (AT_LEAST_ASCII_RESTRICTED)
13401                                                ? ascii_source
13402                                                : PL_XPosix_ptrs[classnum],
13403                                            &posixes);
13404                         }
13405                         else {  /* Locale */
13406                             SV* scratch_list = NULL;
13407
13408                             /* For above Latin1 code points, we use the full
13409                              * Unicode range */
13410                             _invlist_intersection(PL_AboveLatin1,
13411                                                   PL_XPosix_ptrs[classnum],
13412                                                   &scratch_list);
13413                             /* And set the output to it, adding instead if
13414                              * there already is an output.  Checking if
13415                              * 'posixes' is NULL first saves an extra clone.
13416                              * Its reference count will be decremented at the
13417                              * next union, etc, or if this is the only
13418                              * instance, at the end of the routine */
13419                             if (! posixes) {
13420                                 posixes = scratch_list;
13421                             }
13422                             else {
13423                                 _invlist_union(posixes, scratch_list, &posixes);
13424                                 SvREFCNT_dec_NN(scratch_list);
13425                             }
13426
13427 #ifndef HAS_ISBLANK
13428                             if (namedclass != ANYOF_BLANK) {
13429 #endif
13430                                 /* Set this class in the node for runtime
13431                                  * matching */
13432                                 ANYOF_POSIXL_SET(ret, namedclass);
13433 #ifndef HAS_ISBLANK
13434                             }
13435                             else {
13436                                 /* No isblank(), use the hard-coded ASCII-range
13437                                  * blanks, adding them to the running total. */
13438
13439                                 _invlist_union(posixes, ascii_source, &posixes);
13440                             }
13441 #endif
13442                         }
13443                     }
13444                     else {  /* A complemented class, like ANYOF_NPUNCT */
13445                         if (! LOC) {
13446                             _invlist_union_complement_2nd(
13447                                                 posixes,
13448                                                 (AT_LEAST_ASCII_RESTRICTED)
13449                                                     ? ascii_source
13450                                                     : PL_XPosix_ptrs[classnum],
13451                                                 &posixes);
13452                             /* Under /d, everything in the upper half of the
13453                              * Latin1 range matches this complement */
13454                             if (DEPENDS_SEMANTICS) {
13455                                 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
13456                             }
13457                         }
13458                         else {  /* Locale */
13459                             SV* scratch_list = NULL;
13460                             _invlist_subtract(PL_AboveLatin1,
13461                                               PL_XPosix_ptrs[classnum],
13462                                               &scratch_list);
13463                             if (! posixes) {
13464                                 posixes = scratch_list;
13465                             }
13466                             else {
13467                                 _invlist_union(posixes, scratch_list, &posixes);
13468                                 SvREFCNT_dec_NN(scratch_list);
13469                             }
13470 #ifndef HAS_ISBLANK
13471                             if (namedclass != ANYOF_NBLANK) {
13472 #endif
13473                                 ANYOF_POSIXL_SET(ret, namedclass);
13474 #ifndef HAS_ISBLANK
13475                             }
13476                             else {
13477                                 /* Get the list of all code points in Latin1
13478                                  * that are not ASCII blanks, and add them to
13479                                  * the running total */
13480                                 _invlist_subtract(PL_Latin1, ascii_source,
13481                                                   &scratch_list);
13482                                 _invlist_union(posixes, scratch_list, &posixes);
13483                                 SvREFCNT_dec_NN(scratch_list);
13484                             }
13485 #endif
13486                         }
13487                     }
13488                 }
13489               namedclass_done:
13490                 continue;   /* Go get next character */
13491             }
13492         } /* end of namedclass \blah */
13493
13494         /* Here, we have a single value.  If 'range' is set, it is the ending
13495          * of a range--check its validity.  Later, we will handle each
13496          * individual code point in the range.  If 'range' isn't set, this
13497          * could be the beginning of a range, so check for that by looking
13498          * ahead to see if the next real character to be processed is the range
13499          * indicator--the minus sign */
13500
13501         if (skip_white) {
13502             RExC_parse = regpatws(pRExC_state, RExC_parse,
13503                                 FALSE /* means don't recognize comments */);
13504         }
13505
13506         if (range) {
13507             if (prevvalue > value) /* b-a */ {
13508                 const int w = RExC_parse - rangebegin;
13509                 vFAIL2utf8f(
13510                     "Invalid [] range \"%"UTF8f"\"",
13511                     UTF8fARG(UTF, w, rangebegin));
13512                 range = 0; /* not a valid range */
13513             }
13514         }
13515         else {
13516             prevvalue = value; /* save the beginning of the potential range */
13517             if (! stop_at_1     /* Can't be a range if parsing just one thing */
13518                 && *RExC_parse == '-')
13519             {
13520                 char* next_char_ptr = RExC_parse + 1;
13521                 if (skip_white) {   /* Get the next real char after the '-' */
13522                     next_char_ptr = regpatws(pRExC_state,
13523                                              RExC_parse + 1,
13524                                              FALSE); /* means don't recognize
13525                                                         comments */
13526                 }
13527
13528                 /* If the '-' is at the end of the class (just before the ']',
13529                  * it is a literal minus; otherwise it is a range */
13530                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
13531                     RExC_parse = next_char_ptr;
13532
13533                     /* a bad range like \w-, [:word:]- ? */
13534                     if (namedclass > OOB_NAMEDCLASS) {
13535                         if (strict || ckWARN(WARN_REGEXP)) {
13536                             const int w =
13537                                 RExC_parse >= rangebegin ?
13538                                 RExC_parse - rangebegin : 0;
13539                             if (strict) {
13540                                 vFAIL4("False [] range \"%*.*s\"",
13541                                     w, w, rangebegin);
13542                             }
13543                             else {
13544                                 vWARN4(RExC_parse,
13545                                     "False [] range \"%*.*s\"",
13546                                     w, w, rangebegin);
13547                             }
13548                         }
13549                         if (!SIZE_ONLY) {
13550                             cp_list = add_cp_to_invlist(cp_list, '-');
13551                         }
13552                         element_count++;
13553                     } else
13554                         range = 1;      /* yeah, it's a range! */
13555                     continue;   /* but do it the next time */
13556                 }
13557             }
13558         }
13559
13560         /* Here, <prevvalue> is the beginning of the range, if any; or <value>
13561          * if not */
13562
13563         /* non-Latin1 code point implies unicode semantics.  Must be set in
13564          * pass1 so is there for the whole of pass 2 */
13565         if (value > 255) {
13566             RExC_uni_semantics = 1;
13567         }
13568
13569         /* Ready to process either the single value, or the completed range.
13570          * For single-valued non-inverted ranges, we consider the possibility
13571          * of multi-char folds.  (We made a conscious decision to not do this
13572          * for the other cases because it can often lead to non-intuitive
13573          * results.  For example, you have the peculiar case that:
13574          *  "s s" =~ /^[^\xDF]+$/i => Y
13575          *  "ss"  =~ /^[^\xDF]+$/i => N
13576          *
13577          * See [perl #89750] */
13578         if (FOLD && allow_multi_folds && value == prevvalue) {
13579             if (value == LATIN_SMALL_LETTER_SHARP_S
13580                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
13581                                                         value)))
13582             {
13583                 /* Here <value> is indeed a multi-char fold.  Get what it is */
13584
13585                 U8 foldbuf[UTF8_MAXBYTES_CASE];
13586                 STRLEN foldlen;
13587
13588                 UV folded = _to_uni_fold_flags(
13589                                 value,
13590                                 foldbuf,
13591                                 &foldlen,
13592                                 FOLD_FLAGS_FULL
13593                                 | ((LOC) ?  FOLD_FLAGS_LOCALE
13594                                             : (ASCII_FOLD_RESTRICTED)
13595                                               ? FOLD_FLAGS_NOMIX_ASCII
13596                                               : 0)
13597                                 );
13598
13599                 /* Here, <folded> should be the first character of the
13600                  * multi-char fold of <value>, with <foldbuf> containing the
13601                  * whole thing.  But, if this fold is not allowed (because of
13602                  * the flags), <fold> will be the same as <value>, and should
13603                  * be processed like any other character, so skip the special
13604                  * handling */
13605                 if (folded != value) {
13606
13607                     /* Skip if we are recursed, currently parsing the class
13608                      * again.  Otherwise add this character to the list of
13609                      * multi-char folds. */
13610                     if (! RExC_in_multi_char_class) {
13611                         AV** this_array_ptr;
13612                         AV* this_array;
13613                         STRLEN cp_count = utf8_length(foldbuf,
13614                                                       foldbuf + foldlen);
13615                         SV* multi_fold = sv_2mortal(newSVpvn("", 0));
13616
13617                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
13618
13619
13620                         if (! multi_char_matches) {
13621                             multi_char_matches = newAV();
13622                         }
13623
13624                         /* <multi_char_matches> is actually an array of arrays.
13625                          * There will be one or two top-level elements: [2],
13626                          * and/or [3].  The [2] element is an array, each
13627                          * element thereof is a character which folds to TWO
13628                          * characters; [3] is for folds to THREE characters.
13629                          * (Unicode guarantees a maximum of 3 characters in any
13630                          * fold.)  When we rewrite the character class below,
13631                          * we will do so such that the longest folds are
13632                          * written first, so that it prefers the longest
13633                          * matching strings first.  This is done even if it
13634                          * turns out that any quantifier is non-greedy, out of
13635                          * programmer laziness.  Tom Christiansen has agreed
13636                          * that this is ok.  This makes the test for the
13637                          * ligature 'ffi' come before the test for 'ff' */
13638                         if (av_exists(multi_char_matches, cp_count)) {
13639                             this_array_ptr = (AV**) av_fetch(multi_char_matches,
13640                                                              cp_count, FALSE);
13641                             this_array = *this_array_ptr;
13642                         }
13643                         else {
13644                             this_array = newAV();
13645                             av_store(multi_char_matches, cp_count,
13646                                      (SV*) this_array);
13647                         }
13648                         av_push(this_array, multi_fold);
13649                     }
13650
13651                     /* This element should not be processed further in this
13652                      * class */
13653                     element_count--;
13654                     value = save_value;
13655                     prevvalue = save_prevvalue;
13656                     continue;
13657                 }
13658             }
13659         }
13660
13661         /* Deal with this element of the class */
13662         if (! SIZE_ONLY) {
13663 #ifndef EBCDIC
13664             cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
13665 #else
13666             SV* this_range = _new_invlist(1);
13667             _append_range_to_invlist(this_range, prevvalue, value);
13668
13669             /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
13670              * If this range was specified using something like 'i-j', we want
13671              * to include only the 'i' and the 'j', and not anything in
13672              * between, so exclude non-ASCII, non-alphabetics from it.
13673              * However, if the range was specified with something like
13674              * [\x89-\x91] or [\x89-j], all code points within it should be
13675              * included.  literal_endpoint==2 means both ends of the range used
13676              * a literal character, not \x{foo} */
13677             if (literal_endpoint == 2
13678                 && ((prevvalue >= 'a' && value <= 'z')
13679                     || (prevvalue >= 'A' && value <= 'Z')))
13680             {
13681                 _invlist_intersection(this_range, PL_ASCII,
13682                                       &this_range);
13683                 _invlist_intersection(this_range, PL_Posix_ptrs[_CC_ALPHA],
13684                                       &this_range);
13685             }
13686             _invlist_union(cp_list, this_range, &cp_list);
13687             literal_endpoint = 0;
13688 #endif
13689         }
13690
13691         range = 0; /* this range (if it was one) is done now */
13692     } /* End of loop through all the text within the brackets */
13693
13694     /* If anything in the class expands to more than one character, we have to
13695      * deal with them by building up a substitute parse string, and recursively
13696      * calling reg() on it, instead of proceeding */
13697     if (multi_char_matches) {
13698         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
13699         I32 cp_count;
13700         STRLEN len;
13701         char *save_end = RExC_end;
13702         char *save_parse = RExC_parse;
13703         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
13704                                        a "|" */
13705         I32 reg_flags;
13706
13707         assert(! invert);
13708 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
13709            because too confusing */
13710         if (invert) {
13711             sv_catpv(substitute_parse, "(?:");
13712         }
13713 #endif
13714
13715         /* Look at the longest folds first */
13716         for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
13717
13718             if (av_exists(multi_char_matches, cp_count)) {
13719                 AV** this_array_ptr;
13720                 SV* this_sequence;
13721
13722                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
13723                                                  cp_count, FALSE);
13724                 while ((this_sequence = av_pop(*this_array_ptr)) !=
13725                                                                 &PL_sv_undef)
13726                 {
13727                     if (! first_time) {
13728                         sv_catpv(substitute_parse, "|");
13729                     }
13730                     first_time = FALSE;
13731
13732                     sv_catpv(substitute_parse, SvPVX(this_sequence));
13733                 }
13734             }
13735         }
13736
13737         /* If the character class contains anything else besides these
13738          * multi-character folds, have to include it in recursive parsing */
13739         if (element_count) {
13740             sv_catpv(substitute_parse, "|[");
13741             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
13742             sv_catpv(substitute_parse, "]");
13743         }
13744
13745         sv_catpv(substitute_parse, ")");
13746 #if 0
13747         if (invert) {
13748             /* This is a way to get the parse to skip forward a whole named
13749              * sequence instead of matching the 2nd character when it fails the
13750              * first */
13751             sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
13752         }
13753 #endif
13754
13755         RExC_parse = SvPV(substitute_parse, len);
13756         RExC_end = RExC_parse + len;
13757         RExC_in_multi_char_class = 1;
13758         RExC_emit = (regnode *)orig_emit;
13759
13760         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
13761
13762         *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
13763
13764         RExC_parse = save_parse;
13765         RExC_end = save_end;
13766         RExC_in_multi_char_class = 0;
13767         SvREFCNT_dec_NN(multi_char_matches);
13768         return ret;
13769     }
13770
13771     /* If the character class contains only a single element, it may be
13772      * optimizable into another node type which is smaller and runs faster.
13773      * Check if this is the case for this class */
13774     if ((element_count == 1 && ! ret_invlist)
13775         || UNLIKELY(posixl_matches_all))
13776     {
13777         U8 op = END;
13778         U8 arg = 0;
13779
13780         if (UNLIKELY(posixl_matches_all)) {
13781             op = SANY;
13782         }
13783         else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
13784                                                    \w or [:digit:] or \p{foo}
13785                                                  */
13786
13787             /* All named classes are mapped into POSIXish nodes, with its FLAG
13788              * argument giving which class it is */
13789             switch ((I32)namedclass) {
13790                 case ANYOF_UNIPROP:
13791                     break;
13792
13793                 /* These don't depend on the charset modifiers.  They always
13794                  * match under /u rules */
13795                 case ANYOF_NHORIZWS:
13796                 case ANYOF_HORIZWS:
13797                     namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
13798                     /* FALLTHROUGH */
13799
13800                 case ANYOF_NVERTWS:
13801                 case ANYOF_VERTWS:
13802                     op = POSIXU;
13803                     goto join_posix;
13804
13805                 /* The actual POSIXish node for all the rest depends on the
13806                  * charset modifier.  The ones in the first set depend only on
13807                  * ASCII or, if available on this platform, locale */
13808                 case ANYOF_ASCII:
13809                 case ANYOF_NASCII:
13810 #ifdef HAS_ISASCII
13811                     op = (LOC) ? POSIXL : POSIXA;
13812 #else
13813                     op = POSIXA;
13814 #endif
13815                     goto join_posix;
13816
13817                 case ANYOF_NCASED:
13818                 case ANYOF_LOWER:
13819                 case ANYOF_NLOWER:
13820                 case ANYOF_UPPER:
13821                 case ANYOF_NUPPER:
13822                     /* under /a could be alpha */
13823                     if (FOLD) {
13824                         if (ASCII_RESTRICTED) {
13825                             namedclass = ANYOF_ALPHA + (namedclass % 2);
13826                         }
13827                         else if (! LOC) {
13828                             break;
13829                         }
13830                     }
13831                     /* FALLTHROUGH */
13832
13833                 /* The rest have more possibilities depending on the charset.
13834                  * We take advantage of the enum ordering of the charset
13835                  * modifiers to get the exact node type, */
13836                 default:
13837                     op = POSIXD + get_regex_charset(RExC_flags);
13838                     if (op > POSIXA) { /* /aa is same as /a */
13839                         op = POSIXA;
13840                     }
13841 #ifndef HAS_ISBLANK
13842                     if (op == POSIXL
13843                         && (namedclass == ANYOF_BLANK
13844                             || namedclass == ANYOF_NBLANK))
13845                     {
13846                         op = POSIXA;
13847                     }
13848 #endif
13849
13850                 join_posix:
13851                     /* The odd numbered ones are the complements of the
13852                      * next-lower even number one */
13853                     if (namedclass % 2 == 1) {
13854                         invert = ! invert;
13855                         namedclass--;
13856                     }
13857                     arg = namedclass_to_classnum(namedclass);
13858                     break;
13859             }
13860         }
13861         else if (value == prevvalue) {
13862
13863             /* Here, the class consists of just a single code point */
13864
13865             if (invert) {
13866                 if (! LOC && value == '\n') {
13867                     op = REG_ANY; /* Optimize [^\n] */
13868                     *flagp |= HASWIDTH|SIMPLE;
13869                     RExC_naughty++;
13870                 }
13871             }
13872             else if (value < 256 || UTF) {
13873
13874                 /* Optimize a single value into an EXACTish node, but not if it
13875                  * would require converting the pattern to UTF-8. */
13876                 op = compute_EXACTish(pRExC_state);
13877             }
13878         } /* Otherwise is a range */
13879         else if (! LOC) {   /* locale could vary these */
13880             if (prevvalue == '0') {
13881                 if (value == '9') {
13882                     arg = _CC_DIGIT;
13883                     op = POSIXA;
13884                 }
13885             }
13886         }
13887
13888         /* Here, we have changed <op> away from its initial value iff we found
13889          * an optimization */
13890         if (op != END) {
13891
13892             /* Throw away this ANYOF regnode, and emit the calculated one,
13893              * which should correspond to the beginning, not current, state of
13894              * the parse */
13895             const char * cur_parse = RExC_parse;
13896             RExC_parse = (char *)orig_parse;
13897             if ( SIZE_ONLY) {
13898                 if (! LOC) {
13899
13900                     /* To get locale nodes to not use the full ANYOF size would
13901                      * require moving the code above that writes the portions
13902                      * of it that aren't in other nodes to after this point.
13903                      * e.g.  ANYOF_POSIXL_SET */
13904                     RExC_size = orig_size;
13905                 }
13906             }
13907             else {
13908                 RExC_emit = (regnode *)orig_emit;
13909                 if (PL_regkind[op] == POSIXD) {
13910                     if (invert) {
13911                         op += NPOSIXD - POSIXD;
13912                     }
13913                 }
13914             }
13915
13916             ret = reg_node(pRExC_state, op);
13917
13918             if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
13919                 if (! SIZE_ONLY) {
13920                     FLAGS(ret) = arg;
13921                 }
13922                 *flagp |= HASWIDTH|SIMPLE;
13923             }
13924             else if (PL_regkind[op] == EXACT) {
13925                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13926             }
13927
13928             RExC_parse = (char *) cur_parse;
13929
13930             SvREFCNT_dec(posixes);
13931             SvREFCNT_dec(cp_list);
13932             return ret;
13933         }
13934     }
13935
13936     if (SIZE_ONLY)
13937         return ret;
13938     /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
13939
13940     /* If folding, we calculate all characters that could fold to or from the
13941      * ones already on the list */
13942     if (FOLD && cp_list) {
13943         UV start, end;  /* End points of code point ranges */
13944
13945         SV* fold_intersection = NULL;
13946
13947         /* If the highest code point is within Latin1, we can use the
13948          * compiled-in Alphas list, and not have to go out to disk.  This
13949          * yields two false positives, the masculine and feminine ordinal
13950          * indicators, which are weeded out below using the
13951          * IS_IN_SOME_FOLD_L1() macro */
13952         if (invlist_highest(cp_list) < 256) {
13953             _invlist_intersection(PL_L1Posix_ptrs[_CC_ALPHA], cp_list,
13954                                                            &fold_intersection);
13955         }
13956         else {
13957
13958             /* Here, there are non-Latin1 code points, so we will have to go
13959              * fetch the list of all the characters that participate in folds
13960              */
13961             if (! PL_utf8_foldable) {
13962                 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13963                                        &PL_sv_undef, 1, 0);
13964                 PL_utf8_foldable = _get_swash_invlist(swash);
13965                 SvREFCNT_dec_NN(swash);
13966             }
13967
13968             /* This is a hash that for a particular fold gives all characters
13969              * that are involved in it */
13970             if (! PL_utf8_foldclosures) {
13971
13972                 /* If we were unable to find any folds, then we likely won't be
13973                  * able to find the closures.  So just create an empty list.
13974                  * Folding will effectively be restricted to the non-Unicode
13975                  * rules hard-coded into Perl.  (This case happens legitimately
13976                  * during compilation of Perl itself before the Unicode tables
13977                  * are generated) */
13978                 if (_invlist_len(PL_utf8_foldable) == 0) {
13979                     PL_utf8_foldclosures = newHV();
13980                 }
13981                 else {
13982                     /* If the folds haven't been read in, call a fold function
13983                      * to force that */
13984                     if (! PL_utf8_tofold) {
13985                         U8 dummy[UTF8_MAXBYTES_CASE+1];
13986
13987                         /* This string is just a short named one above \xff */
13988                         to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
13989                         assert(PL_utf8_tofold); /* Verify that worked */
13990                     }
13991                     PL_utf8_foldclosures =
13992                                     _swash_inversion_hash(PL_utf8_tofold);
13993                 }
13994             }
13995
13996             /* Only the characters in this class that participate in folds need
13997              * be checked.  Get the intersection of this class and all the
13998              * possible characters that are foldable.  This can quickly narrow
13999              * down a large class */
14000             _invlist_intersection(PL_utf8_foldable, cp_list,
14001                                   &fold_intersection);
14002         }
14003
14004         /* Now look at the foldable characters in this class individually */
14005         invlist_iterinit(fold_intersection);
14006         while (invlist_iternext(fold_intersection, &start, &end)) {
14007             UV j;
14008
14009             /* Locale folding for Latin1 characters is deferred until runtime */
14010             if (LOC && start < 256) {
14011                 start = 256;
14012             }
14013
14014             /* Look at every character in the range */
14015             for (j = start; j <= end; j++) {
14016
14017                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
14018                 STRLEN foldlen;
14019                 SV** listp;
14020
14021                 if (j < 256) {
14022
14023                     /* We have the latin1 folding rules hard-coded here so that
14024                      * an innocent-looking character class, like /[ks]/i won't
14025                      * have to go out to disk to find the possible matches.
14026                      * XXX It would be better to generate these via regen, in
14027                      * case a new version of the Unicode standard adds new
14028                      * mappings, though that is not really likely, and may be
14029                      * caught by the default: case of the switch below. */
14030
14031                     if (IS_IN_SOME_FOLD_L1(j)) {
14032
14033                         /* ASCII is always matched; non-ASCII is matched only
14034                          * under Unicode rules */
14035                         if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
14036                             cp_list =
14037                                 add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
14038                         }
14039                         else {
14040                             depends_list =
14041                              add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
14042                         }
14043                     }
14044
14045                     if (HAS_NONLATIN1_FOLD_CLOSURE(j)
14046                         && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
14047                     {
14048                         /* Certain Latin1 characters have matches outside
14049                          * Latin1.  To get here, <j> is one of those
14050                          * characters.   None of these matches is valid for
14051                          * ASCII characters under /aa, which is why the 'if'
14052                          * just above excludes those.  These matches only
14053                          * happen when the target string is utf8.  The code
14054                          * below adds the single fold closures for <j> to the
14055                          * inversion list. */
14056                         switch (j) {
14057                             case 'k':
14058                             case 'K':
14059                                 cp_list =
14060                                     add_cp_to_invlist(cp_list, KELVIN_SIGN);
14061                                 break;
14062                             case 's':
14063                             case 'S':
14064                                 cp_list = add_cp_to_invlist(cp_list,
14065                                                     LATIN_SMALL_LETTER_LONG_S);
14066                                 break;
14067                             case MICRO_SIGN:
14068                                 cp_list = add_cp_to_invlist(cp_list,
14069                                                     GREEK_CAPITAL_LETTER_MU);
14070                                 cp_list = add_cp_to_invlist(cp_list,
14071                                                     GREEK_SMALL_LETTER_MU);
14072                                 break;
14073                             case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
14074                             case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
14075                                 cp_list =
14076                                     add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
14077                                 break;
14078                             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
14079                                 cp_list = add_cp_to_invlist(cp_list,
14080                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
14081                                 break;
14082                             case LATIN_SMALL_LETTER_SHARP_S:
14083                                 cp_list = add_cp_to_invlist(cp_list,
14084                                                 LATIN_CAPITAL_LETTER_SHARP_S);
14085                                 break;
14086                             case 'F': case 'f':
14087                             case 'I': case 'i':
14088                             case 'L': case 'l':
14089                             case 'T': case 't':
14090                             case 'A': case 'a':
14091                             case 'H': case 'h':
14092                             case 'J': case 'j':
14093                             case 'N': case 'n':
14094                             case 'W': case 'w':
14095                             case 'Y': case 'y':
14096                                 /* These all are targets of multi-character
14097                                  * folds from code points that require UTF8 to
14098                                  * express, so they can't match unless the
14099                                  * target string is in UTF-8, so no action here
14100                                  * is necessary, as regexec.c properly handles
14101                                  * the general case for UTF-8 matching and
14102                                  * multi-char folds */
14103                                 break;
14104                             default:
14105                                 /* Use deprecated warning to increase the
14106                                  * chances of this being output */
14107                                 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
14108                                 break;
14109                         }
14110                     }
14111                     continue;
14112                 }
14113
14114                 /* Here is an above Latin1 character.  We don't have the rules
14115                  * hard-coded for it.  First, get its fold.  This is the simple
14116                  * fold, as the multi-character folds have been handled earlier
14117                  * and separated out */
14118                 _to_uni_fold_flags(j, foldbuf, &foldlen,
14119                                                ((LOC)
14120                                                ? FOLD_FLAGS_LOCALE
14121                                                : (ASCII_FOLD_RESTRICTED)
14122                                                   ? FOLD_FLAGS_NOMIX_ASCII
14123                                                   : 0));
14124
14125                 /* Single character fold of above Latin1.  Add everything in
14126                  * its fold closure to the list that this node should match.
14127                  * The fold closures data structure is a hash with the keys
14128                  * being the UTF-8 of every character that is folded to, like
14129                  * 'k', and the values each an array of all code points that
14130                  * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
14131                  * Multi-character folds are not included */
14132                 if ((listp = hv_fetch(PL_utf8_foldclosures,
14133                                       (char *) foldbuf, foldlen, FALSE)))
14134                 {
14135                     AV* list = (AV*) *listp;
14136                     IV k;
14137                     for (k = 0; k <= av_len(list); k++) {
14138                         SV** c_p = av_fetch(list, k, FALSE);
14139                         UV c;
14140                         if (c_p == NULL) {
14141                             Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
14142                         }
14143                         c = SvUV(*c_p);
14144
14145                         /* /aa doesn't allow folds between ASCII and non-; /l
14146                          * doesn't allow them between above and below 256 */
14147                         if ((ASCII_FOLD_RESTRICTED
14148                                   && (isASCII(c) != isASCII(j)))
14149                             || (LOC && c < 256)) {
14150                             continue;
14151                         }
14152
14153                         /* Folds involving non-ascii Latin1 characters
14154                          * under /d are added to a separate list */
14155                         if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
14156                         {
14157                             cp_list = add_cp_to_invlist(cp_list, c);
14158                         }
14159                         else {
14160                           depends_list = add_cp_to_invlist(depends_list, c);
14161                         }
14162                     }
14163                 }
14164             }
14165         }
14166         SvREFCNT_dec_NN(fold_intersection);
14167     }
14168
14169     /* And combine the result (if any) with any inversion list from posix
14170      * classes.  The lists are kept separate up to now because we don't want to
14171      * fold the classes (folding of those is automatically handled by the swash
14172      * fetching code) */
14173     if (posixes) {
14174         if (! DEPENDS_SEMANTICS) {
14175             if (cp_list) {
14176                 _invlist_union(cp_list, posixes, &cp_list);
14177                 SvREFCNT_dec_NN(posixes);
14178             }
14179             else {
14180                 cp_list = posixes;
14181             }
14182         }
14183         else {
14184             /* Under /d, we put into a separate list the Latin1 things that
14185              * match only when the target string is utf8 */
14186             SV* nonascii_but_latin1_properties = NULL;
14187             _invlist_intersection(posixes, PL_UpperLatin1,
14188                                   &nonascii_but_latin1_properties);
14189             _invlist_subtract(posixes, nonascii_but_latin1_properties,
14190                               &posixes);
14191             if (cp_list) {
14192                 _invlist_union(cp_list, posixes, &cp_list);
14193                 SvREFCNT_dec_NN(posixes);
14194             }
14195             else {
14196                 cp_list = posixes;
14197             }
14198
14199             if (depends_list) {
14200                 _invlist_union(depends_list, nonascii_but_latin1_properties,
14201                                &depends_list);
14202                 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
14203             }
14204             else {
14205                 depends_list = nonascii_but_latin1_properties;
14206             }
14207         }
14208     }
14209
14210     /* And combine the result (if any) with any inversion list from properties.
14211      * The lists are kept separate up to now so that we can distinguish the two
14212      * in regards to matching above-Unicode.  A run-time warning is generated
14213      * if a Unicode property is matched against a non-Unicode code point. But,
14214      * we allow user-defined properties to match anything, without any warning,
14215      * and we also suppress the warning if there is a portion of the character
14216      * class that isn't a Unicode property, and which matches above Unicode, \W
14217      * or [\x{110000}] for example.
14218      * (Note that in this case, unlike the Posix one above, there is no
14219      * <depends_list>, because having a Unicode property forces Unicode
14220      * semantics */
14221     if (properties) {
14222         bool warn_super = ! has_user_defined_property;
14223         if (cp_list) {
14224
14225             /* If it matters to the final outcome, see if a non-property
14226              * component of the class matches above Unicode.  If so, the
14227              * warning gets suppressed.  This is true even if just a single
14228              * such code point is specified, as though not strictly correct if
14229              * another such code point is matched against, the fact that they
14230              * are using above-Unicode code points indicates they should know
14231              * the issues involved */
14232             if (warn_super) {
14233                 bool non_prop_matches_above_Unicode =
14234                             runtime_posix_matches_above_Unicode
14235                             | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
14236                 if (invert) {
14237                     non_prop_matches_above_Unicode =
14238                                             !  non_prop_matches_above_Unicode;
14239                 }
14240                 warn_super = ! non_prop_matches_above_Unicode;
14241             }
14242
14243             _invlist_union(properties, cp_list, &cp_list);
14244             SvREFCNT_dec_NN(properties);
14245         }
14246         else {
14247             cp_list = properties;
14248         }
14249
14250         if (warn_super) {
14251             OP(ret) = ANYOF_WARN_SUPER;
14252         }
14253     }
14254
14255     /* Here, we have calculated what code points should be in the character
14256      * class.
14257      *
14258      * Now we can see about various optimizations.  Fold calculation (which we
14259      * did above) needs to take place before inversion.  Otherwise /[^k]/i
14260      * would invert to include K, which under /i would match k, which it
14261      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
14262      * folded until runtime */
14263
14264     /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
14265      * at compile time.  Besides not inverting folded locale now, we can't
14266      * invert if there are things such as \w, which aren't known until runtime
14267      * */
14268     if (invert
14269         && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_POSIXL)))
14270         && ! depends_list
14271         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14272     {
14273         _invlist_invert(cp_list);
14274
14275         /* Any swash can't be used as-is, because we've inverted things */
14276         if (swash) {
14277             SvREFCNT_dec_NN(swash);
14278             swash = NULL;
14279         }
14280
14281         /* Clear the invert flag since have just done it here */
14282         invert = FALSE;
14283     }
14284
14285     if (ret_invlist) {
14286         *ret_invlist = cp_list;
14287         SvREFCNT_dec(swash);
14288
14289         /* Discard the generated node */
14290         if (SIZE_ONLY) {
14291             RExC_size = orig_size;
14292         }
14293         else {
14294             RExC_emit = orig_emit;
14295         }
14296         return orig_emit;
14297     }
14298
14299     /* If we didn't do folding, it's because some information isn't available
14300      * until runtime; set the run-time fold flag for these.  (We don't have to
14301      * worry about properties folding, as that is taken care of by the swash
14302      * fetching) */
14303     if (FOLD && LOC)
14304     {
14305        ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14306     }
14307
14308     /* Some character classes are equivalent to other nodes.  Such nodes take
14309      * up less room and generally fewer operations to execute than ANYOF nodes.
14310      * Above, we checked for and optimized into some such equivalents for
14311      * certain common classes that are easy to test.  Getting to this point in
14312      * the code means that the class didn't get optimized there.  Since this
14313      * code is only executed in Pass 2, it is too late to save space--it has
14314      * been allocated in Pass 1, and currently isn't given back.  But turning
14315      * things into an EXACTish node can allow the optimizer to join it to any
14316      * adjacent such nodes.  And if the class is equivalent to things like /./,
14317      * expensive run-time swashes can be avoided.  Now that we have more
14318      * complete information, we can find things necessarily missed by the
14319      * earlier code.  I (khw) am not sure how much to look for here.  It would
14320      * be easy, but perhaps too slow, to check any candidates against all the
14321      * node types they could possibly match using _invlistEQ(). */
14322
14323     if (cp_list
14324         && ! invert
14325         && ! depends_list
14326         && ! (ANYOF_FLAGS(ret) & ANYOF_POSIXL)
14327         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14328     {
14329         UV start, end;
14330         U8 op = END;  /* The optimzation node-type */
14331         const char * cur_parse= RExC_parse;
14332
14333         invlist_iterinit(cp_list);
14334         if (! invlist_iternext(cp_list, &start, &end)) {
14335
14336             /* Here, the list is empty.  This happens, for example, when a
14337              * Unicode property is the only thing in the character class, and
14338              * it doesn't match anything.  (perluniprops.pod notes such
14339              * properties) */
14340             op = OPFAIL;
14341             *flagp |= HASWIDTH|SIMPLE;
14342         }
14343         else if (start == end) {    /* The range is a single code point */
14344             if (! invlist_iternext(cp_list, &start, &end)
14345
14346                     /* Don't do this optimization if it would require changing
14347                      * the pattern to UTF-8 */
14348                 && (start < 256 || UTF))
14349             {
14350                 /* Here, the list contains a single code point.  Can optimize
14351                  * into an EXACT node */
14352
14353                 value = start;
14354
14355                 if (! FOLD) {
14356                     op = EXACT;
14357                 }
14358                 else if (LOC) {
14359
14360                     /* A locale node under folding with one code point can be
14361                      * an EXACTFL, as its fold won't be calculated until
14362                      * runtime */
14363                     op = EXACTFL;
14364                 }
14365                 else {
14366
14367                     /* Here, we are generally folding, but there is only one
14368                      * code point to match.  If we have to, we use an EXACT
14369                      * node, but it would be better for joining with adjacent
14370                      * nodes in the optimization pass if we used the same
14371                      * EXACTFish node that any such are likely to be.  We can
14372                      * do this iff the code point doesn't participate in any
14373                      * folds.  For example, an EXACTF of a colon is the same as
14374                      * an EXACT one, since nothing folds to or from a colon. */
14375                     if (value < 256) {
14376                         if (IS_IN_SOME_FOLD_L1(value)) {
14377                             op = EXACT;
14378                         }
14379                     }
14380                     else {
14381                         if (! PL_utf8_foldable) {
14382                             SV* swash = swash_init("utf8", "_Perl_Any_Folds",
14383                                                 &PL_sv_undef, 1, 0);
14384                             PL_utf8_foldable = _get_swash_invlist(swash);
14385                             SvREFCNT_dec_NN(swash);
14386                         }
14387                         if (_invlist_contains_cp(PL_utf8_foldable, value)) {
14388                             op = EXACT;
14389                         }
14390                     }
14391
14392                     /* If we haven't found the node type, above, it means we
14393                      * can use the prevailing one */
14394                     if (op == END) {
14395                         op = compute_EXACTish(pRExC_state);
14396                     }
14397                 }
14398             }
14399         }
14400         else if (start == 0) {
14401             if (end == UV_MAX) {
14402                 op = SANY;
14403                 *flagp |= HASWIDTH|SIMPLE;
14404                 RExC_naughty++;
14405             }
14406             else if (end == '\n' - 1
14407                     && invlist_iternext(cp_list, &start, &end)
14408                     && start == '\n' + 1 && end == UV_MAX)
14409             {
14410                 op = REG_ANY;
14411                 *flagp |= HASWIDTH|SIMPLE;
14412                 RExC_naughty++;
14413             }
14414         }
14415         invlist_iterfinish(cp_list);
14416
14417         if (op != END) {
14418             RExC_parse = (char *)orig_parse;
14419             RExC_emit = (regnode *)orig_emit;
14420
14421             ret = reg_node(pRExC_state, op);
14422
14423             RExC_parse = (char *)cur_parse;
14424
14425             if (PL_regkind[op] == EXACT) {
14426                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
14427             }
14428
14429             SvREFCNT_dec_NN(cp_list);
14430             return ret;
14431         }
14432     }
14433
14434     /* Here, <cp_list> contains all the code points we can determine at
14435      * compile time that match under all conditions.  Go through it, and
14436      * for things that belong in the bitmap, put them there, and delete from
14437      * <cp_list>.  While we are at it, see if everything above 255 is in the
14438      * list, and if so, set a flag to speed up execution */
14439
14440     populate_ANYOF_from_invlist(ret, &cp_list);
14441
14442     if (invert) {
14443         ANYOF_FLAGS(ret) |= ANYOF_INVERT;
14444     }
14445
14446     /* Here, the bitmap has been populated with all the Latin1 code points that
14447      * always match.  Can now add to the overall list those that match only
14448      * when the target string is UTF-8 (<depends_list>). */
14449     if (depends_list) {
14450         if (cp_list) {
14451             _invlist_union(cp_list, depends_list, &cp_list);
14452             SvREFCNT_dec_NN(depends_list);
14453         }
14454         else {
14455             cp_list = depends_list;
14456         }
14457     }
14458
14459     /* If there is a swash and more than one element, we can't use the swash in
14460      * the optimization below. */
14461     if (swash && element_count > 1) {
14462         SvREFCNT_dec_NN(swash);
14463         swash = NULL;
14464     }
14465
14466     set_ANYOF_arg(pRExC_state, ret, cp_list,
14467                   (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14468                    ? listsv : NULL,
14469                   swash, has_user_defined_property);
14470
14471     *flagp |= HASWIDTH|SIMPLE;
14472     return ret;
14473 }
14474
14475 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14476
14477 STATIC void
14478 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
14479                 regnode* const node,
14480                 SV* const cp_list,
14481                 SV* const runtime_defns,
14482                 SV* const swash,
14483                 const bool has_user_defined_property)
14484 {
14485     /* Sets the arg field of an ANYOF-type node 'node', using information about
14486      * the node passed-in.  If there is nothing outside the node's bitmap, the
14487      * arg is set to ANYOF_NONBITMAP_EMPTY.  Otherwise, it sets the argument to
14488      * the count returned by add_data(), having allocated and stored an array,
14489      * av, that that count references, as follows:
14490      *  av[0] stores the character class description in its textual form.
14491      *        This is used later (regexec.c:Perl_regclass_swash()) to
14492      *        initialize the appropriate swash, and is also useful for dumping
14493      *        the regnode.  This is set to &PL_sv_undef if the textual
14494      *        description is not needed at run-time (as happens if the other
14495      *        elements completely define the class)
14496      *  av[1] if &PL_sv_undef, is a placeholder to later contain the swash
14497      *        computed from av[0].  But if no further computation need be done,
14498      *        the swash is stored here now (and av[0] is &PL_sv_undef).
14499      *  av[2] stores the cp_list inversion list for use in addition or instead
14500      *        of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
14501      *        (Otherwise everything needed is already in av[0] and av[1])
14502      *  av[3] is set if any component of the class is from a user-defined
14503      *        property; used only if av[2] exists */
14504
14505     UV n;
14506
14507     PERL_ARGS_ASSERT_SET_ANYOF_ARG;
14508
14509     if (! cp_list && ! runtime_defns) {
14510         ARG_SET(node, ANYOF_NONBITMAP_EMPTY);
14511     }
14512     else {
14513         AV * const av = newAV();
14514         SV *rv;
14515
14516         av_store(av, 0, (runtime_defns)
14517                         ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
14518         if (swash) {
14519             av_store(av, 1, swash);
14520             SvREFCNT_dec_NN(cp_list);
14521         }
14522         else {
14523             av_store(av, 1, &PL_sv_undef);
14524             if (cp_list) {
14525                 av_store(av, 2, cp_list);
14526                 av_store(av, 3, newSVuv(has_user_defined_property));
14527             }
14528         }
14529
14530         rv = newRV_noinc(MUTABLE_SV(av));
14531         n = add_data(pRExC_state, STR_WITH_LEN("s"));
14532         RExC_rxi->data->data[n] = (void*)rv;
14533         ARG_SET(node, n);
14534     }
14535 }
14536
14537
14538 /* reg_skipcomment()
14539
14540    Absorbs an /x style # comments from the input stream.
14541    Returns true if there is more text remaining in the stream.
14542    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
14543    terminates the pattern without including a newline.
14544
14545    Note its the callers responsibility to ensure that we are
14546    actually in /x mode
14547
14548 */
14549
14550 STATIC bool
14551 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
14552 {
14553     bool ended = 0;
14554
14555     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
14556
14557     while (RExC_parse < RExC_end)
14558         if (*RExC_parse++ == '\n') {
14559             ended = 1;
14560             break;
14561         }
14562     if (!ended) {
14563         /* we ran off the end of the pattern without ending
14564            the comment, so we have to add an \n when wrapping */
14565         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
14566         return 0;
14567     } else
14568         return 1;
14569 }
14570
14571 /* nextchar()
14572
14573    Advances the parse position, and optionally absorbs
14574    "whitespace" from the inputstream.
14575
14576    Without /x "whitespace" means (?#...) style comments only,
14577    with /x this means (?#...) and # comments and whitespace proper.
14578
14579    Returns the RExC_parse point from BEFORE the scan occurs.
14580
14581    This is the /x friendly way of saying RExC_parse++.
14582 */
14583
14584 STATIC char*
14585 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
14586 {
14587     char* const retval = RExC_parse++;
14588
14589     PERL_ARGS_ASSERT_NEXTCHAR;
14590
14591     for (;;) {
14592         if (RExC_end - RExC_parse >= 3
14593             && *RExC_parse == '('
14594             && RExC_parse[1] == '?'
14595             && RExC_parse[2] == '#')
14596         {
14597             while (*RExC_parse != ')') {
14598                 if (RExC_parse == RExC_end)
14599                     FAIL("Sequence (?#... not terminated");
14600                 RExC_parse++;
14601             }
14602             RExC_parse++;
14603             continue;
14604         }
14605         if (RExC_flags & RXf_PMf_EXTENDED) {
14606             if (isSPACE(*RExC_parse)) {
14607                 RExC_parse++;
14608                 continue;
14609             }
14610             else if (*RExC_parse == '#') {
14611                 if ( reg_skipcomment( pRExC_state ) )
14612                     continue;
14613             }
14614         }
14615         return retval;
14616     }
14617 }
14618
14619 /*
14620 - reg_node - emit a node
14621 */
14622 STATIC regnode *                        /* Location. */
14623 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
14624 {
14625     dVAR;
14626     regnode *ptr;
14627     regnode * const ret = RExC_emit;
14628     GET_RE_DEBUG_FLAGS_DECL;
14629
14630     PERL_ARGS_ASSERT_REG_NODE;
14631
14632     if (SIZE_ONLY) {
14633         SIZE_ALIGN(RExC_size);
14634         RExC_size += 1;
14635         return(ret);
14636     }
14637     if (RExC_emit >= RExC_emit_bound)
14638         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
14639                    op, RExC_emit, RExC_emit_bound);
14640
14641     NODE_ALIGN_FILL(ret);
14642     ptr = ret;
14643     FILL_ADVANCE_NODE(ptr, op);
14644 #ifdef RE_TRACK_PATTERN_OFFSETS
14645     if (RExC_offsets) {         /* MJD */
14646         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
14647               "reg_node", __LINE__,
14648               PL_reg_name[op],
14649               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
14650                 ? "Overwriting end of array!\n" : "OK",
14651               (UV)(RExC_emit - RExC_emit_start),
14652               (UV)(RExC_parse - RExC_start),
14653               (UV)RExC_offsets[0]));
14654         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
14655     }
14656 #endif
14657     RExC_emit = ptr;
14658     return(ret);
14659 }
14660
14661 /*
14662 - reganode - emit a node with an argument
14663 */
14664 STATIC regnode *                        /* Location. */
14665 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
14666 {
14667     dVAR;
14668     regnode *ptr;
14669     regnode * const ret = RExC_emit;
14670     GET_RE_DEBUG_FLAGS_DECL;
14671
14672     PERL_ARGS_ASSERT_REGANODE;
14673
14674     if (SIZE_ONLY) {
14675         SIZE_ALIGN(RExC_size);
14676         RExC_size += 2;
14677         /*
14678            We can't do this:
14679
14680            assert(2==regarglen[op]+1);
14681
14682            Anything larger than this has to allocate the extra amount.
14683            If we changed this to be:
14684
14685            RExC_size += (1 + regarglen[op]);
14686
14687            then it wouldn't matter. Its not clear what side effect
14688            might come from that so its not done so far.
14689            -- dmq
14690         */
14691         return(ret);
14692     }
14693     if (RExC_emit >= RExC_emit_bound)
14694         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
14695                    op, RExC_emit, RExC_emit_bound);
14696
14697     NODE_ALIGN_FILL(ret);
14698     ptr = ret;
14699     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
14700 #ifdef RE_TRACK_PATTERN_OFFSETS
14701     if (RExC_offsets) {         /* MJD */
14702         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14703               "reganode",
14704               __LINE__,
14705               PL_reg_name[op],
14706               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
14707               "Overwriting end of array!\n" : "OK",
14708               (UV)(RExC_emit - RExC_emit_start),
14709               (UV)(RExC_parse - RExC_start),
14710               (UV)RExC_offsets[0]));
14711         Set_Cur_Node_Offset;
14712     }
14713 #endif
14714     RExC_emit = ptr;
14715     return(ret);
14716 }
14717
14718 /*
14719 - reguni - emit (if appropriate) a Unicode character
14720 */
14721 STATIC STRLEN
14722 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
14723 {
14724     dVAR;
14725
14726     PERL_ARGS_ASSERT_REGUNI;
14727
14728     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
14729 }
14730
14731 /*
14732 - reginsert - insert an operator in front of already-emitted operand
14733 *
14734 * Means relocating the operand.
14735 */
14736 STATIC void
14737 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
14738 {
14739     dVAR;
14740     regnode *src;
14741     regnode *dst;
14742     regnode *place;
14743     const int offset = regarglen[(U8)op];
14744     const int size = NODE_STEP_REGNODE + offset;
14745     GET_RE_DEBUG_FLAGS_DECL;
14746
14747     PERL_ARGS_ASSERT_REGINSERT;
14748     PERL_UNUSED_ARG(depth);
14749 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
14750     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
14751     if (SIZE_ONLY) {
14752         RExC_size += size;
14753         return;
14754     }
14755
14756     src = RExC_emit;
14757     RExC_emit += size;
14758     dst = RExC_emit;
14759     if (RExC_open_parens) {
14760         int paren;
14761         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
14762         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
14763             if ( RExC_open_parens[paren] >= opnd ) {
14764                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
14765                 RExC_open_parens[paren] += size;
14766             } else {
14767                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
14768             }
14769             if ( RExC_close_parens[paren] >= opnd ) {
14770                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
14771                 RExC_close_parens[paren] += size;
14772             } else {
14773                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
14774             }
14775         }
14776     }
14777
14778     while (src > opnd) {
14779         StructCopy(--src, --dst, regnode);
14780 #ifdef RE_TRACK_PATTERN_OFFSETS
14781         if (RExC_offsets) {     /* MJD 20010112 */
14782             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
14783                   "reg_insert",
14784                   __LINE__,
14785                   PL_reg_name[op],
14786                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
14787                     ? "Overwriting end of array!\n" : "OK",
14788                   (UV)(src - RExC_emit_start),
14789                   (UV)(dst - RExC_emit_start),
14790                   (UV)RExC_offsets[0]));
14791             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
14792             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
14793         }
14794 #endif
14795     }
14796
14797
14798     place = opnd;               /* Op node, where operand used to be. */
14799 #ifdef RE_TRACK_PATTERN_OFFSETS
14800     if (RExC_offsets) {         /* MJD */
14801         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14802               "reginsert",
14803               __LINE__,
14804               PL_reg_name[op],
14805               (UV)(place - RExC_emit_start) > RExC_offsets[0]
14806               ? "Overwriting end of array!\n" : "OK",
14807               (UV)(place - RExC_emit_start),
14808               (UV)(RExC_parse - RExC_start),
14809               (UV)RExC_offsets[0]));
14810         Set_Node_Offset(place, RExC_parse);
14811         Set_Node_Length(place, 1);
14812     }
14813 #endif
14814     src = NEXTOPER(place);
14815     FILL_ADVANCE_NODE(place, op);
14816     Zero(src, offset, regnode);
14817 }
14818
14819 /*
14820 - regtail - set the next-pointer at the end of a node chain of p to val.
14821 - SEE ALSO: regtail_study
14822 */
14823 /* TODO: All three parms should be const */
14824 STATIC void
14825 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14826 {
14827     dVAR;
14828     regnode *scan;
14829     GET_RE_DEBUG_FLAGS_DECL;
14830
14831     PERL_ARGS_ASSERT_REGTAIL;
14832 #ifndef DEBUGGING
14833     PERL_UNUSED_ARG(depth);
14834 #endif
14835
14836     if (SIZE_ONLY)
14837         return;
14838
14839     /* Find last node. */
14840     scan = p;
14841     for (;;) {
14842         regnode * const temp = regnext(scan);
14843         DEBUG_PARSE_r({
14844             SV * const mysv=sv_newmortal();
14845             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
14846             regprop(RExC_rx, mysv, scan);
14847             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
14848                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
14849                     (temp == NULL ? "->" : ""),
14850                     (temp == NULL ? PL_reg_name[OP(val)] : "")
14851             );
14852         });
14853         if (temp == NULL)
14854             break;
14855         scan = temp;
14856     }
14857
14858     if (reg_off_by_arg[OP(scan)]) {
14859         ARG_SET(scan, val - scan);
14860     }
14861     else {
14862         NEXT_OFF(scan) = val - scan;
14863     }
14864 }
14865
14866 #ifdef DEBUGGING
14867 /*
14868 - regtail_study - set the next-pointer at the end of a node chain of p to val.
14869 - Look for optimizable sequences at the same time.
14870 - currently only looks for EXACT chains.
14871
14872 This is experimental code. The idea is to use this routine to perform
14873 in place optimizations on branches and groups as they are constructed,
14874 with the long term intention of removing optimization from study_chunk so
14875 that it is purely analytical.
14876
14877 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
14878 to control which is which.
14879
14880 */
14881 /* TODO: All four parms should be const */
14882
14883 STATIC U8
14884 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14885 {
14886     dVAR;
14887     regnode *scan;
14888     U8 exact = PSEUDO;
14889 #ifdef EXPERIMENTAL_INPLACESCAN
14890     I32 min = 0;
14891 #endif
14892     GET_RE_DEBUG_FLAGS_DECL;
14893
14894     PERL_ARGS_ASSERT_REGTAIL_STUDY;
14895
14896
14897     if (SIZE_ONLY)
14898         return exact;
14899
14900     /* Find last node. */
14901
14902     scan = p;
14903     for (;;) {
14904         regnode * const temp = regnext(scan);
14905 #ifdef EXPERIMENTAL_INPLACESCAN
14906         if (PL_regkind[OP(scan)] == EXACT) {
14907             bool has_exactf_sharp_s;    /* Unexamined in this routine */
14908             if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
14909                 return EXACT;
14910         }
14911 #endif
14912         if ( exact ) {
14913             switch (OP(scan)) {
14914                 case EXACT:
14915                 case EXACTF:
14916                 case EXACTFA_NO_TRIE:
14917                 case EXACTFA:
14918                 case EXACTFU:
14919                 case EXACTFU_SS:
14920                 case EXACTFL:
14921                         if( exact == PSEUDO )
14922                             exact= OP(scan);
14923                         else if ( exact != OP(scan) )
14924                             exact= 0;
14925                 case NOTHING:
14926                     break;
14927                 default:
14928                     exact= 0;
14929             }
14930         }
14931         DEBUG_PARSE_r({
14932             SV * const mysv=sv_newmortal();
14933             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
14934             regprop(RExC_rx, mysv, scan);
14935             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
14936                 SvPV_nolen_const(mysv),
14937                 REG_NODE_NUM(scan),
14938                 PL_reg_name[exact]);
14939         });
14940         if (temp == NULL)
14941             break;
14942         scan = temp;
14943     }
14944     DEBUG_PARSE_r({
14945         SV * const mysv_val=sv_newmortal();
14946         DEBUG_PARSE_MSG("");
14947         regprop(RExC_rx, mysv_val, val);
14948         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
14949                       SvPV_nolen_const(mysv_val),
14950                       (IV)REG_NODE_NUM(val),
14951                       (IV)(val - scan)
14952         );
14953     });
14954     if (reg_off_by_arg[OP(scan)]) {
14955         ARG_SET(scan, val - scan);
14956     }
14957     else {
14958         NEXT_OFF(scan) = val - scan;
14959     }
14960
14961     return exact;
14962 }
14963 #endif
14964
14965 /*
14966  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
14967  */
14968 #ifdef DEBUGGING
14969
14970 static void
14971 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
14972 {
14973     int bit;
14974     int set=0;
14975
14976     for (bit=0; bit<32; bit++) {
14977         if (flags & (1<<bit)) {
14978             if (!set++ && lead)
14979                 PerlIO_printf(Perl_debug_log, "%s",lead);
14980             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
14981         }
14982     }
14983     if (lead)  {
14984         if (set)
14985             PerlIO_printf(Perl_debug_log, "\n");
14986         else
14987             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
14988     }
14989 }
14990
14991 static void
14992 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
14993 {
14994     int bit;
14995     int set=0;
14996     regex_charset cs;
14997
14998     for (bit=0; bit<32; bit++) {
14999         if (flags & (1<<bit)) {
15000             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
15001                 continue;
15002             }
15003             if (!set++ && lead)
15004                 PerlIO_printf(Perl_debug_log, "%s",lead);
15005             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
15006         }
15007     }
15008     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
15009             if (!set++ && lead) {
15010                 PerlIO_printf(Perl_debug_log, "%s",lead);
15011             }
15012             switch (cs) {
15013                 case REGEX_UNICODE_CHARSET:
15014                     PerlIO_printf(Perl_debug_log, "UNICODE");
15015                     break;
15016                 case REGEX_LOCALE_CHARSET:
15017                     PerlIO_printf(Perl_debug_log, "LOCALE");
15018                     break;
15019                 case REGEX_ASCII_RESTRICTED_CHARSET:
15020                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
15021                     break;
15022                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
15023                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
15024                     break;
15025                 default:
15026                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
15027                     break;
15028             }
15029     }
15030     if (lead)  {
15031         if (set)
15032             PerlIO_printf(Perl_debug_log, "\n");
15033         else
15034             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15035     }
15036 }
15037 #endif
15038
15039 void
15040 Perl_regdump(pTHX_ const regexp *r)
15041 {
15042 #ifdef DEBUGGING
15043     dVAR;
15044     SV * const sv = sv_newmortal();
15045     SV *dsv= sv_newmortal();
15046     RXi_GET_DECL(r,ri);
15047     GET_RE_DEBUG_FLAGS_DECL;
15048
15049     PERL_ARGS_ASSERT_REGDUMP;
15050
15051     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
15052
15053     /* Header fields of interest. */
15054     if (r->anchored_substr) {
15055         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
15056             RE_SV_DUMPLEN(r->anchored_substr), 30);
15057         PerlIO_printf(Perl_debug_log,
15058                       "anchored %s%s at %"IVdf" ",
15059                       s, RE_SV_TAIL(r->anchored_substr),
15060                       (IV)r->anchored_offset);
15061     } else if (r->anchored_utf8) {
15062         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
15063             RE_SV_DUMPLEN(r->anchored_utf8), 30);
15064         PerlIO_printf(Perl_debug_log,
15065                       "anchored utf8 %s%s at %"IVdf" ",
15066                       s, RE_SV_TAIL(r->anchored_utf8),
15067                       (IV)r->anchored_offset);
15068     }
15069     if (r->float_substr) {
15070         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
15071             RE_SV_DUMPLEN(r->float_substr), 30);
15072         PerlIO_printf(Perl_debug_log,
15073                       "floating %s%s at %"IVdf"..%"UVuf" ",
15074                       s, RE_SV_TAIL(r->float_substr),
15075                       (IV)r->float_min_offset, (UV)r->float_max_offset);
15076     } else if (r->float_utf8) {
15077         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
15078             RE_SV_DUMPLEN(r->float_utf8), 30);
15079         PerlIO_printf(Perl_debug_log,
15080                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
15081                       s, RE_SV_TAIL(r->float_utf8),
15082                       (IV)r->float_min_offset, (UV)r->float_max_offset);
15083     }
15084     if (r->check_substr || r->check_utf8)
15085         PerlIO_printf(Perl_debug_log,
15086                       (const char *)
15087                       (r->check_substr == r->float_substr
15088                        && r->check_utf8 == r->float_utf8
15089                        ? "(checking floating" : "(checking anchored"));
15090     if (r->extflags & RXf_NOSCAN)
15091         PerlIO_printf(Perl_debug_log, " noscan");
15092     if (r->extflags & RXf_CHECK_ALL)
15093         PerlIO_printf(Perl_debug_log, " isall");
15094     if (r->check_substr || r->check_utf8)
15095         PerlIO_printf(Perl_debug_log, ") ");
15096
15097     if (ri->regstclass) {
15098         regprop(r, sv, ri->regstclass);
15099         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
15100     }
15101     if (r->extflags & RXf_ANCH) {
15102         PerlIO_printf(Perl_debug_log, "anchored");
15103         if (r->extflags & RXf_ANCH_BOL)
15104             PerlIO_printf(Perl_debug_log, "(BOL)");
15105         if (r->extflags & RXf_ANCH_MBOL)
15106             PerlIO_printf(Perl_debug_log, "(MBOL)");
15107         if (r->extflags & RXf_ANCH_SBOL)
15108             PerlIO_printf(Perl_debug_log, "(SBOL)");
15109         if (r->extflags & RXf_ANCH_GPOS)
15110             PerlIO_printf(Perl_debug_log, "(GPOS)");
15111         PerlIO_putc(Perl_debug_log, ' ');
15112     }
15113     if (r->extflags & RXf_GPOS_SEEN)
15114         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
15115     if (r->intflags & PREGf_SKIP)
15116         PerlIO_printf(Perl_debug_log, "plus ");
15117     if (r->intflags & PREGf_IMPLICIT)
15118         PerlIO_printf(Perl_debug_log, "implicit ");
15119     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
15120     if (r->extflags & RXf_EVAL_SEEN)
15121         PerlIO_printf(Perl_debug_log, "with eval ");
15122     PerlIO_printf(Perl_debug_log, "\n");
15123     DEBUG_FLAGS_r({
15124         regdump_extflags("r->extflags: ",r->extflags);
15125         regdump_intflags("r->intflags: ",r->intflags);
15126     });
15127 #else
15128     PERL_ARGS_ASSERT_REGDUMP;
15129     PERL_UNUSED_CONTEXT;
15130     PERL_UNUSED_ARG(r);
15131 #endif  /* DEBUGGING */
15132 }
15133
15134 /*
15135 - regprop - printable representation of opcode
15136 */
15137
15138 void
15139 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
15140 {
15141 #ifdef DEBUGGING
15142     dVAR;
15143     int k;
15144
15145     /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
15146     static const char * const anyofs[] = {
15147 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
15148     || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
15149     || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
15150     || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
15151     || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
15152     || _CC_VERTSPACE != 16
15153   #error Need to adjust order of anyofs[]
15154 #endif
15155         "\\w",
15156         "\\W",
15157         "\\d",
15158         "\\D",
15159         "[:alpha:]",
15160         "[:^alpha:]",
15161         "[:lower:]",
15162         "[:^lower:]",
15163         "[:upper:]",
15164         "[:^upper:]",
15165         "[:punct:]",
15166         "[:^punct:]",
15167         "[:print:]",
15168         "[:^print:]",
15169         "[:alnum:]",
15170         "[:^alnum:]",
15171         "[:graph:]",
15172         "[:^graph:]",
15173         "[:cased:]",
15174         "[:^cased:]",
15175         "\\s",
15176         "\\S",
15177         "[:blank:]",
15178         "[:^blank:]",
15179         "[:xdigit:]",
15180         "[:^xdigit:]",
15181         "[:space:]",
15182         "[:^space:]",
15183         "[:cntrl:]",
15184         "[:^cntrl:]",
15185         "[:ascii:]",
15186         "[:^ascii:]",
15187         "\\v",
15188         "\\V"
15189     };
15190     RXi_GET_DECL(prog,progi);
15191     GET_RE_DEBUG_FLAGS_DECL;
15192
15193     PERL_ARGS_ASSERT_REGPROP;
15194
15195     sv_setpvs(sv, "");
15196
15197     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
15198         /* It would be nice to FAIL() here, but this may be called from
15199            regexec.c, and it would be hard to supply pRExC_state. */
15200         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
15201     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
15202
15203     k = PL_regkind[OP(o)];
15204
15205     if (k == EXACT) {
15206         sv_catpvs(sv, " ");
15207         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
15208          * is a crude hack but it may be the best for now since
15209          * we have no flag "this EXACTish node was UTF-8"
15210          * --jhi */
15211         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
15212                   PERL_PV_ESCAPE_UNI_DETECT |
15213                   PERL_PV_ESCAPE_NONASCII   |
15214                   PERL_PV_PRETTY_ELLIPSES   |
15215                   PERL_PV_PRETTY_LTGT       |
15216                   PERL_PV_PRETTY_NOCLEAR
15217                   );
15218     } else if (k == TRIE) {
15219         /* print the details of the trie in dumpuntil instead, as
15220          * progi->data isn't available here */
15221         const char op = OP(o);
15222         const U32 n = ARG(o);
15223         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
15224                (reg_ac_data *)progi->data->data[n] :
15225                NULL;
15226         const reg_trie_data * const trie
15227             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
15228
15229         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
15230         DEBUG_TRIE_COMPILE_r(
15231             Perl_sv_catpvf(aTHX_ sv,
15232                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
15233                 (UV)trie->startstate,
15234                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
15235                 (UV)trie->wordcount,
15236                 (UV)trie->minlen,
15237                 (UV)trie->maxlen,
15238                 (UV)TRIE_CHARCOUNT(trie),
15239                 (UV)trie->uniquecharcount
15240             )
15241         );
15242         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
15243             sv_catpvs(sv, "[");
15244             (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
15245                                                    ? ANYOF_BITMAP(o)
15246                                                    : TRIE_BITMAP(trie));
15247             sv_catpvs(sv, "]");
15248         }
15249
15250     } else if (k == CURLY) {
15251         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
15252             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
15253         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
15254     }
15255     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
15256         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
15257     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
15258         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
15259         if ( RXp_PAREN_NAMES(prog) ) {
15260             if ( k != REF || (OP(o) < NREF)) {
15261                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
15262                 SV **name= av_fetch(list, ARG(o), 0 );
15263                 if (name)
15264                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15265             }
15266             else {
15267                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
15268                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
15269                 I32 *nums=(I32*)SvPVX(sv_dat);
15270                 SV **name= av_fetch(list, nums[0], 0 );
15271                 I32 n;
15272                 if (name) {
15273                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
15274                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
15275                                     (n ? "," : ""), (IV)nums[n]);
15276                     }
15277                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15278                 }
15279             }
15280         }
15281     } else if (k == GOSUB)
15282         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
15283     else if (k == VERB) {
15284         if (!o->flags)
15285             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
15286                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
15287     } else if (k == LOGICAL)
15288         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
15289     else if (k == ANYOF) {
15290         const U8 flags = ANYOF_FLAGS(o);
15291         int do_sep = 0;
15292
15293
15294         if (flags & ANYOF_LOCALE)
15295             sv_catpvs(sv, "{loc}");
15296         if (flags & ANYOF_LOC_FOLD)
15297             sv_catpvs(sv, "{i}");
15298         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
15299         if (flags & ANYOF_INVERT)
15300             sv_catpvs(sv, "^");
15301
15302         /* output what the standard cp 0-255 bitmap matches */
15303         do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
15304
15305         /* output any special charclass tests (used entirely under use
15306          * locale) * */
15307         if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
15308             int i;
15309             for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
15310                 if (ANYOF_POSIXL_TEST(o,i)) {
15311                     sv_catpv(sv, anyofs[i]);
15312                     do_sep = 1;
15313                 }
15314             }
15315         }
15316
15317         if (flags & (ANYOF_ABOVE_LATIN1_ALL|ANYOF_ABOVE_LATIN1_ALL)
15318             || ANYOF_NONBITMAP(o))
15319         {
15320             if (do_sep) {
15321                 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
15322                 if (flags & ANYOF_INVERT)
15323                     /*make sure the invert info is in each */
15324                     sv_catpvs(sv, "^");
15325             }
15326
15327         if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
15328             sv_catpvs(sv, "{non-utf8-latin1-all}");
15329         }
15330
15331         /* output information about the unicode matching */
15332         if (flags & ANYOF_ABOVE_LATIN1_ALL)
15333             sv_catpvs(sv, "{unicode_all}");
15334         else if (ANYOF_NONBITMAP(o)) {
15335             SV *lv; /* Set if there is something outside the bit map. */
15336             bool byte_output = FALSE;   /* If something in the bitmap has been
15337                                            output */
15338
15339             if (flags & ANYOF_NONBITMAP_NON_UTF8) {
15340                 sv_catpvs(sv, "{outside bitmap}");
15341             }
15342             else {
15343                 sv_catpvs(sv, "{utf8}");
15344             }
15345
15346             /* Get the stuff that wasn't in the bitmap */
15347             (void) regclass_swash(prog, o, FALSE, &lv, NULL);
15348             if (lv && lv != &PL_sv_undef) {
15349                 char *s = savesvpv(lv);
15350                 char * const origs = s;
15351
15352                 while (*s && *s != '\n')
15353                     s++;
15354
15355                 if (*s == '\n') {
15356                     const char * const t = ++s;
15357
15358                     if (byte_output) {
15359                         sv_catpvs(sv, " ");
15360                     }
15361
15362                     while (*s) {
15363                         if (*s == '\n') {
15364
15365                             /* Truncate very long output */
15366                             if (s - origs > 256) {
15367                                 Perl_sv_catpvf(aTHX_ sv,
15368                                                "%.*s...",
15369                                                (int) (s - origs - 1),
15370                                                t);
15371                                 goto out_dump;
15372                             }
15373                             *s = ' ';
15374                         }
15375                         else if (*s == '\t') {
15376                             *s = '-';
15377                         }
15378                         s++;
15379                     }
15380                     if (s[-1] == ' ')
15381                         s[-1] = 0;
15382
15383                     sv_catpv(sv, t);
15384                 }
15385
15386             out_dump:
15387
15388                 Safefree(origs);
15389                 SvREFCNT_dec_NN(lv);
15390             }
15391         }
15392         }
15393
15394         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
15395     }
15396     else if (k == POSIXD || k == NPOSIXD) {
15397         U8 index = FLAGS(o) * 2;
15398         if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
15399             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
15400         }
15401         else {
15402             if (*anyofs[index] != '[')  {
15403                 sv_catpv(sv, "[");
15404             }
15405             sv_catpv(sv, anyofs[index]);
15406             if (*anyofs[index] != '[')  {
15407                 sv_catpv(sv, "]");
15408             }
15409         }
15410     }
15411     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
15412         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
15413 #else
15414     PERL_UNUSED_CONTEXT;
15415     PERL_UNUSED_ARG(sv);
15416     PERL_UNUSED_ARG(o);
15417     PERL_UNUSED_ARG(prog);
15418 #endif  /* DEBUGGING */
15419 }
15420
15421 SV *
15422 Perl_re_intuit_string(pTHX_ REGEXP * const r)
15423 {                               /* Assume that RE_INTUIT is set */
15424     dVAR;
15425     struct regexp *const prog = ReANY(r);
15426     GET_RE_DEBUG_FLAGS_DECL;
15427
15428     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
15429     PERL_UNUSED_CONTEXT;
15430
15431     DEBUG_COMPILE_r(
15432         {
15433             const char * const s = SvPV_nolen_const(prog->check_substr
15434                       ? prog->check_substr : prog->check_utf8);
15435
15436             if (!PL_colorset) reginitcolors();
15437             PerlIO_printf(Perl_debug_log,
15438                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
15439                       PL_colors[4],
15440                       prog->check_substr ? "" : "utf8 ",
15441                       PL_colors[5],PL_colors[0],
15442                       s,
15443                       PL_colors[1],
15444                       (strlen(s) > 60 ? "..." : ""));
15445         } );
15446
15447     return prog->check_substr ? prog->check_substr : prog->check_utf8;
15448 }
15449
15450 /*
15451    pregfree()
15452
15453    handles refcounting and freeing the perl core regexp structure. When
15454    it is necessary to actually free the structure the first thing it
15455    does is call the 'free' method of the regexp_engine associated to
15456    the regexp, allowing the handling of the void *pprivate; member
15457    first. (This routine is not overridable by extensions, which is why
15458    the extensions free is called first.)
15459
15460    See regdupe and regdupe_internal if you change anything here.
15461 */
15462 #ifndef PERL_IN_XSUB_RE
15463 void
15464 Perl_pregfree(pTHX_ REGEXP *r)
15465 {
15466     SvREFCNT_dec(r);
15467 }
15468
15469 void
15470 Perl_pregfree2(pTHX_ REGEXP *rx)
15471 {
15472     dVAR;
15473     struct regexp *const r = ReANY(rx);
15474     GET_RE_DEBUG_FLAGS_DECL;
15475
15476     PERL_ARGS_ASSERT_PREGFREE2;
15477
15478     if (r->mother_re) {
15479         ReREFCNT_dec(r->mother_re);
15480     } else {
15481         CALLREGFREE_PVT(rx); /* free the private data */
15482         SvREFCNT_dec(RXp_PAREN_NAMES(r));
15483         Safefree(r->xpv_len_u.xpvlenu_pv);
15484     }
15485     if (r->substrs) {
15486         SvREFCNT_dec(r->anchored_substr);
15487         SvREFCNT_dec(r->anchored_utf8);
15488         SvREFCNT_dec(r->float_substr);
15489         SvREFCNT_dec(r->float_utf8);
15490         Safefree(r->substrs);
15491     }
15492     RX_MATCH_COPY_FREE(rx);
15493 #ifdef PERL_ANY_COW
15494     SvREFCNT_dec(r->saved_copy);
15495 #endif
15496     Safefree(r->offs);
15497     SvREFCNT_dec(r->qr_anoncv);
15498     rx->sv_u.svu_rx = 0;
15499 }
15500
15501 /*  reg_temp_copy()
15502
15503     This is a hacky workaround to the structural issue of match results
15504     being stored in the regexp structure which is in turn stored in
15505     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
15506     could be PL_curpm in multiple contexts, and could require multiple
15507     result sets being associated with the pattern simultaneously, such
15508     as when doing a recursive match with (??{$qr})
15509
15510     The solution is to make a lightweight copy of the regexp structure
15511     when a qr// is returned from the code executed by (??{$qr}) this
15512     lightweight copy doesn't actually own any of its data except for
15513     the starp/end and the actual regexp structure itself.
15514
15515 */
15516
15517
15518 REGEXP *
15519 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
15520 {
15521     struct regexp *ret;
15522     struct regexp *const r = ReANY(rx);
15523     const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
15524
15525     PERL_ARGS_ASSERT_REG_TEMP_COPY;
15526
15527     if (!ret_x)
15528         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
15529     else {
15530         SvOK_off((SV *)ret_x);
15531         if (islv) {
15532             /* For PVLVs, SvANY points to the xpvlv body while sv_u points
15533                to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
15534                made both spots point to the same regexp body.) */
15535             REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
15536             assert(!SvPVX(ret_x));
15537             ret_x->sv_u.svu_rx = temp->sv_any;
15538             temp->sv_any = NULL;
15539             SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
15540             SvREFCNT_dec_NN(temp);
15541             /* SvCUR still resides in the xpvlv struct, so the regexp copy-
15542                ing below will not set it. */
15543             SvCUR_set(ret_x, SvCUR(rx));
15544         }
15545     }
15546     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
15547        sv_force_normal(sv) is called.  */
15548     SvFAKE_on(ret_x);
15549     ret = ReANY(ret_x);
15550
15551     SvFLAGS(ret_x) |= SvUTF8(rx);
15552     /* We share the same string buffer as the original regexp, on which we
15553        hold a reference count, incremented when mother_re is set below.
15554        The string pointer is copied here, being part of the regexp struct.
15555      */
15556     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
15557            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
15558     if (r->offs) {
15559         const I32 npar = r->nparens+1;
15560         Newx(ret->offs, npar, regexp_paren_pair);
15561         Copy(r->offs, ret->offs, npar, regexp_paren_pair);
15562     }
15563     if (r->substrs) {
15564         Newx(ret->substrs, 1, struct reg_substr_data);
15565         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15566
15567         SvREFCNT_inc_void(ret->anchored_substr);
15568         SvREFCNT_inc_void(ret->anchored_utf8);
15569         SvREFCNT_inc_void(ret->float_substr);
15570         SvREFCNT_inc_void(ret->float_utf8);
15571
15572         /* check_substr and check_utf8, if non-NULL, point to either their
15573            anchored or float namesakes, and don't hold a second reference.  */
15574     }
15575     RX_MATCH_COPIED_off(ret_x);
15576 #ifdef PERL_ANY_COW
15577     ret->saved_copy = NULL;
15578 #endif
15579     ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
15580     SvREFCNT_inc_void(ret->qr_anoncv);
15581
15582     return ret_x;
15583 }
15584 #endif
15585
15586 /* regfree_internal()
15587
15588    Free the private data in a regexp. This is overloadable by
15589    extensions. Perl takes care of the regexp structure in pregfree(),
15590    this covers the *pprivate pointer which technically perl doesn't
15591    know about, however of course we have to handle the
15592    regexp_internal structure when no extension is in use.
15593
15594    Note this is called before freeing anything in the regexp
15595    structure.
15596  */
15597
15598 void
15599 Perl_regfree_internal(pTHX_ REGEXP * const rx)
15600 {
15601     dVAR;
15602     struct regexp *const r = ReANY(rx);
15603     RXi_GET_DECL(r,ri);
15604     GET_RE_DEBUG_FLAGS_DECL;
15605
15606     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
15607
15608     DEBUG_COMPILE_r({
15609         if (!PL_colorset)
15610             reginitcolors();
15611         {
15612             SV *dsv= sv_newmortal();
15613             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
15614                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
15615             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
15616                 PL_colors[4],PL_colors[5],s);
15617         }
15618     });
15619 #ifdef RE_TRACK_PATTERN_OFFSETS
15620     if (ri->u.offsets)
15621         Safefree(ri->u.offsets);             /* 20010421 MJD */
15622 #endif
15623     if (ri->code_blocks) {
15624         int n;
15625         for (n = 0; n < ri->num_code_blocks; n++)
15626             SvREFCNT_dec(ri->code_blocks[n].src_regex);
15627         Safefree(ri->code_blocks);
15628     }
15629
15630     if (ri->data) {
15631         int n = ri->data->count;
15632
15633         while (--n >= 0) {
15634           /* If you add a ->what type here, update the comment in regcomp.h */
15635             switch (ri->data->what[n]) {
15636             case 'a':
15637             case 'r':
15638             case 's':
15639             case 'S':
15640             case 'u':
15641                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
15642                 break;
15643             case 'f':
15644                 Safefree(ri->data->data[n]);
15645                 break;
15646             case 'l':
15647             case 'L':
15648                 break;
15649             case 'T':
15650                 { /* Aho Corasick add-on structure for a trie node.
15651                      Used in stclass optimization only */
15652                     U32 refcount;
15653                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
15654                     OP_REFCNT_LOCK;
15655                     refcount = --aho->refcount;
15656                     OP_REFCNT_UNLOCK;
15657                     if ( !refcount ) {
15658                         PerlMemShared_free(aho->states);
15659                         PerlMemShared_free(aho->fail);
15660                          /* do this last!!!! */
15661                         PerlMemShared_free(ri->data->data[n]);
15662                         PerlMemShared_free(ri->regstclass);
15663                     }
15664                 }
15665                 break;
15666             case 't':
15667                 {
15668                     /* trie structure. */
15669                     U32 refcount;
15670                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
15671                     OP_REFCNT_LOCK;
15672                     refcount = --trie->refcount;
15673                     OP_REFCNT_UNLOCK;
15674                     if ( !refcount ) {
15675                         PerlMemShared_free(trie->charmap);
15676                         PerlMemShared_free(trie->states);
15677                         PerlMemShared_free(trie->trans);
15678                         if (trie->bitmap)
15679                             PerlMemShared_free(trie->bitmap);
15680                         if (trie->jump)
15681                             PerlMemShared_free(trie->jump);
15682                         PerlMemShared_free(trie->wordinfo);
15683                         /* do this last!!!! */
15684                         PerlMemShared_free(ri->data->data[n]);
15685                     }
15686                 }
15687                 break;
15688             default:
15689                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
15690             }
15691         }
15692         Safefree(ri->data->what);
15693         Safefree(ri->data);
15694     }
15695
15696     Safefree(ri);
15697 }
15698
15699 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
15700 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
15701 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
15702
15703 /*
15704    re_dup - duplicate a regexp.
15705
15706    This routine is expected to clone a given regexp structure. It is only
15707    compiled under USE_ITHREADS.
15708
15709    After all of the core data stored in struct regexp is duplicated
15710    the regexp_engine.dupe method is used to copy any private data
15711    stored in the *pprivate pointer. This allows extensions to handle
15712    any duplication it needs to do.
15713
15714    See pregfree() and regfree_internal() if you change anything here.
15715 */
15716 #if defined(USE_ITHREADS)
15717 #ifndef PERL_IN_XSUB_RE
15718 void
15719 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
15720 {
15721     dVAR;
15722     I32 npar;
15723     const struct regexp *r = ReANY(sstr);
15724     struct regexp *ret = ReANY(dstr);
15725
15726     PERL_ARGS_ASSERT_RE_DUP_GUTS;
15727
15728     npar = r->nparens+1;
15729     Newx(ret->offs, npar, regexp_paren_pair);
15730     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
15731
15732     if (ret->substrs) {
15733         /* Do it this way to avoid reading from *r after the StructCopy().
15734            That way, if any of the sv_dup_inc()s dislodge *r from the L1
15735            cache, it doesn't matter.  */
15736         const bool anchored = r->check_substr
15737             ? r->check_substr == r->anchored_substr
15738             : r->check_utf8 == r->anchored_utf8;
15739         Newx(ret->substrs, 1, struct reg_substr_data);
15740         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15741
15742         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
15743         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
15744         ret->float_substr = sv_dup_inc(ret->float_substr, param);
15745         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
15746
15747         /* check_substr and check_utf8, if non-NULL, point to either their
15748            anchored or float namesakes, and don't hold a second reference.  */
15749
15750         if (ret->check_substr) {
15751             if (anchored) {
15752                 assert(r->check_utf8 == r->anchored_utf8);
15753                 ret->check_substr = ret->anchored_substr;
15754                 ret->check_utf8 = ret->anchored_utf8;
15755             } else {
15756                 assert(r->check_substr == r->float_substr);
15757                 assert(r->check_utf8 == r->float_utf8);
15758                 ret->check_substr = ret->float_substr;
15759                 ret->check_utf8 = ret->float_utf8;
15760             }
15761         } else if (ret->check_utf8) {
15762             if (anchored) {
15763                 ret->check_utf8 = ret->anchored_utf8;
15764             } else {
15765                 ret->check_utf8 = ret->float_utf8;
15766             }
15767         }
15768     }
15769
15770     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
15771     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
15772
15773     if (ret->pprivate)
15774         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
15775
15776     if (RX_MATCH_COPIED(dstr))
15777         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
15778     else
15779         ret->subbeg = NULL;
15780 #ifdef PERL_ANY_COW
15781     ret->saved_copy = NULL;
15782 #endif
15783
15784     /* Whether mother_re be set or no, we need to copy the string.  We
15785        cannot refrain from copying it when the storage points directly to
15786        our mother regexp, because that's
15787                1: a buffer in a different thread
15788                2: something we no longer hold a reference on
15789                so we need to copy it locally.  */
15790     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
15791     ret->mother_re   = NULL;
15792 }
15793 #endif /* PERL_IN_XSUB_RE */
15794
15795 /*
15796    regdupe_internal()
15797
15798    This is the internal complement to regdupe() which is used to copy
15799    the structure pointed to by the *pprivate pointer in the regexp.
15800    This is the core version of the extension overridable cloning hook.
15801    The regexp structure being duplicated will be copied by perl prior
15802    to this and will be provided as the regexp *r argument, however
15803    with the /old/ structures pprivate pointer value. Thus this routine
15804    may override any copying normally done by perl.
15805
15806    It returns a pointer to the new regexp_internal structure.
15807 */
15808
15809 void *
15810 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
15811 {
15812     dVAR;
15813     struct regexp *const r = ReANY(rx);
15814     regexp_internal *reti;
15815     int len;
15816     RXi_GET_DECL(r,ri);
15817
15818     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
15819
15820     len = ProgLen(ri);
15821
15822     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
15823     Copy(ri->program, reti->program, len+1, regnode);
15824
15825     reti->num_code_blocks = ri->num_code_blocks;
15826     if (ri->code_blocks) {
15827         int n;
15828         Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
15829                 struct reg_code_block);
15830         Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
15831                 struct reg_code_block);
15832         for (n = 0; n < ri->num_code_blocks; n++)
15833              reti->code_blocks[n].src_regex = (REGEXP*)
15834                     sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
15835     }
15836     else
15837         reti->code_blocks = NULL;
15838
15839     reti->regstclass = NULL;
15840
15841     if (ri->data) {
15842         struct reg_data *d;
15843         const int count = ri->data->count;
15844         int i;
15845
15846         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
15847                 char, struct reg_data);
15848         Newx(d->what, count, U8);
15849
15850         d->count = count;
15851         for (i = 0; i < count; i++) {
15852             d->what[i] = ri->data->what[i];
15853             switch (d->what[i]) {
15854                 /* see also regcomp.h and regfree_internal() */
15855             case 'a': /* actually an AV, but the dup function is identical.  */
15856             case 'r':
15857             case 's':
15858             case 'S':
15859             case 'u': /* actually an HV, but the dup function is identical.  */
15860                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
15861                 break;
15862             case 'f':
15863                 /* This is cheating. */
15864                 Newx(d->data[i], 1, regnode_ssc);
15865                 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
15866                 reti->regstclass = (regnode*)d->data[i];
15867                 break;
15868             case 'T':
15869                 /* Trie stclasses are readonly and can thus be shared
15870                  * without duplication. We free the stclass in pregfree
15871                  * when the corresponding reg_ac_data struct is freed.
15872                  */
15873                 reti->regstclass= ri->regstclass;
15874                 /* Fall through */
15875             case 't':
15876                 OP_REFCNT_LOCK;
15877                 ((reg_trie_data*)ri->data->data[i])->refcount++;
15878                 OP_REFCNT_UNLOCK;
15879                 /* Fall through */
15880             case 'l':
15881             case 'L':
15882                 d->data[i] = ri->data->data[i];
15883                 break;
15884             default:
15885                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
15886             }
15887         }
15888
15889         reti->data = d;
15890     }
15891     else
15892         reti->data = NULL;
15893
15894     reti->name_list_idx = ri->name_list_idx;
15895
15896 #ifdef RE_TRACK_PATTERN_OFFSETS
15897     if (ri->u.offsets) {
15898         Newx(reti->u.offsets, 2*len+1, U32);
15899         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
15900     }
15901 #else
15902     SetProgLen(reti,len);
15903 #endif
15904
15905     return (void*)reti;
15906 }
15907
15908 #endif    /* USE_ITHREADS */
15909
15910 #ifndef PERL_IN_XSUB_RE
15911
15912 /*
15913  - regnext - dig the "next" pointer out of a node
15914  */
15915 regnode *
15916 Perl_regnext(pTHX_ regnode *p)
15917 {
15918     dVAR;
15919     I32 offset;
15920
15921     if (!p)
15922         return(NULL);
15923
15924     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
15925         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
15926     }
15927
15928     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
15929     if (offset == 0)
15930         return(NULL);
15931
15932     return(p+offset);
15933 }
15934 #endif
15935
15936 STATIC void
15937 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
15938 {
15939     va_list args;
15940     STRLEN l1 = strlen(pat1);
15941     STRLEN l2 = strlen(pat2);
15942     char buf[512];
15943     SV *msv;
15944     const char *message;
15945
15946     PERL_ARGS_ASSERT_RE_CROAK2;
15947
15948     if (l1 > 510)
15949         l1 = 510;
15950     if (l1 + l2 > 510)
15951         l2 = 510 - l1;
15952     Copy(pat1, buf, l1 , char);
15953     Copy(pat2, buf + l1, l2 , char);
15954     buf[l1 + l2] = '\n';
15955     buf[l1 + l2 + 1] = '\0';
15956     va_start(args, pat2);
15957     msv = vmess(buf, &args);
15958     va_end(args);
15959     message = SvPV_const(msv,l1);
15960     if (l1 > 512)
15961         l1 = 512;
15962     Copy(message, buf, l1 , char);
15963     /* l1-1 to avoid \n */
15964     Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
15965 }
15966
15967 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
15968
15969 #ifndef PERL_IN_XSUB_RE
15970 void
15971 Perl_save_re_context(pTHX)
15972 {
15973     dVAR;
15974
15975     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
15976     if (PL_curpm) {
15977         const REGEXP * const rx = PM_GETRE(PL_curpm);
15978         if (rx) {
15979             U32 i;
15980             for (i = 1; i <= RX_NPARENS(rx); i++) {
15981                 char digits[TYPE_CHARS(long)];
15982                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
15983                 GV *const *const gvp
15984                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
15985
15986                 if (gvp) {
15987                     GV * const gv = *gvp;
15988                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
15989                         save_scalar(gv);
15990                 }
15991             }
15992         }
15993     }
15994 }
15995 #endif
15996
15997 #ifdef DEBUGGING
15998
15999 STATIC void
16000 S_put_byte(pTHX_ SV *sv, int c)
16001 {
16002     PERL_ARGS_ASSERT_PUT_BYTE;
16003
16004     /* Our definition of isPRINT() ignores locales, so only bytes that are
16005        not part of UTF-8 are considered printable. I assume that the same
16006        holds for UTF-EBCDIC.
16007        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
16008        which Wikipedia says:
16009
16010        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
16011        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
16012        identical, to the ASCII delete (DEL) or rubout control character. ...
16013        it is typically mapped to hexadecimal code 9F, in order to provide a
16014        unique character mapping in both directions)
16015
16016        So the old condition can be simplified to !isPRINT(c)  */
16017     if (!isPRINT(c)) {
16018         switch (c) {
16019             case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
16020             case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
16021             case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
16022             case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
16023             case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
16024
16025             default:
16026                 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
16027                 break;
16028         }
16029     }
16030     else {
16031         const char string = c;
16032         if (c == '-' || c == ']' || c == '\\' || c == '^')
16033             sv_catpvs(sv, "\\");
16034         sv_catpvn(sv, &string, 1);
16035     }
16036 }
16037
16038 STATIC bool
16039 S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
16040 {
16041     /* Appends to 'sv' a displayable version of the innards of the bracketed
16042      * character class whose bitmap is 'bitmap';  Returns 'TRUE' if it actually
16043      * output anything */
16044
16045     int i;
16046     int rangestart = -1;
16047     bool has_output_anything = FALSE;
16048
16049     PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
16050
16051     for (i = 0; i <= 256; i++) {
16052         if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) {
16053             if (rangestart == -1)
16054                 rangestart = i;
16055         } else if (rangestart != -1) {
16056             int j = i - 1;
16057             if (i <= rangestart + 3) {  /* Individual chars in short ranges */
16058                 for (; rangestart < i; rangestart++)
16059                     put_byte(sv, rangestart);
16060             }
16061             else if (   j > 255
16062                      || ! isALPHANUMERIC(rangestart)
16063                      || ! isALPHANUMERIC(j)
16064                      || isDIGIT(rangestart) != isDIGIT(j)
16065                      || isUPPER(rangestart) != isUPPER(j)
16066                      || isLOWER(rangestart) != isLOWER(j)
16067
16068                         /* This final test should get optimized out except
16069                          * on EBCDIC platforms, where it causes ranges that
16070                          * cross discontinuities like i/j to be shown as hex
16071                          * instead of the misleading, e.g. H-K (since that
16072                          * range includes more than H, I, J, K). */
16073                      || (j - rangestart)
16074                          != NATIVE_TO_ASCII(j) - NATIVE_TO_ASCII(rangestart))
16075             {
16076                 Perl_sv_catpvf(aTHX_ sv, "\\x{%02x}-\\x{%02x}",
16077                                rangestart,
16078                                (j < 256) ? j : 255);
16079             }
16080             else { /* Here, the ends of the range are both digits, or both
16081                       uppercase, or both lowercase; and there's no
16082                       discontinuity in the range (which could happen on EBCDIC
16083                       platforms) */
16084                 put_byte(sv, rangestart);
16085                 sv_catpvs(sv, "-");
16086                 put_byte(sv, j);
16087             }
16088             rangestart = -1;
16089             has_output_anything = TRUE;
16090         }
16091     }
16092
16093     return has_output_anything;
16094 }
16095
16096 #define CLEAR_OPTSTART \
16097     if (optstart) STMT_START { \
16098             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
16099             optstart=NULL; \
16100     } STMT_END
16101
16102 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
16103
16104 STATIC const regnode *
16105 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
16106             const regnode *last, const regnode *plast,
16107             SV* sv, I32 indent, U32 depth)
16108 {
16109     dVAR;
16110     U8 op = PSEUDO;     /* Arbitrary non-END op. */
16111     const regnode *next;
16112     const regnode *optstart= NULL;
16113
16114     RXi_GET_DECL(r,ri);
16115     GET_RE_DEBUG_FLAGS_DECL;
16116
16117     PERL_ARGS_ASSERT_DUMPUNTIL;
16118
16119 #ifdef DEBUG_DUMPUNTIL
16120     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
16121         last ? last-start : 0,plast ? plast-start : 0);
16122 #endif
16123
16124     if (plast && plast < last)
16125         last= plast;
16126
16127     while (PL_regkind[op] != END && (!last || node < last)) {
16128         /* While that wasn't END last time... */
16129         NODE_ALIGN(node);
16130         op = OP(node);
16131         if (op == CLOSE || op == WHILEM)
16132             indent--;
16133         next = regnext((regnode *)node);
16134
16135         /* Where, what. */
16136         if (OP(node) == OPTIMIZED) {
16137             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
16138                 optstart = node;
16139             else
16140                 goto after_print;
16141         } else
16142             CLEAR_OPTSTART;
16143
16144         regprop(r, sv, node);
16145         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
16146                       (int)(2*indent + 1), "", SvPVX_const(sv));
16147
16148         if (OP(node) != OPTIMIZED) {
16149             if (next == NULL)           /* Next ptr. */
16150                 PerlIO_printf(Perl_debug_log, " (0)");
16151             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
16152                 PerlIO_printf(Perl_debug_log, " (FAIL)");
16153             else
16154                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
16155             (void)PerlIO_putc(Perl_debug_log, '\n');
16156         }
16157
16158       after_print:
16159         if (PL_regkind[(U8)op] == BRANCHJ) {
16160             assert(next);
16161             {
16162                 const regnode *nnode = (OP(next) == LONGJMP
16163                                        ? regnext((regnode *)next)
16164                                        : next);
16165                 if (last && nnode > last)
16166                     nnode = last;
16167                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
16168             }
16169         }
16170         else if (PL_regkind[(U8)op] == BRANCH) {
16171             assert(next);
16172             DUMPUNTIL(NEXTOPER(node), next);
16173         }
16174         else if ( PL_regkind[(U8)op]  == TRIE ) {
16175             const regnode *this_trie = node;
16176             const char op = OP(node);
16177             const U32 n = ARG(node);
16178             const reg_ac_data * const ac = op>=AHOCORASICK ?
16179                (reg_ac_data *)ri->data->data[n] :
16180                NULL;
16181             const reg_trie_data * const trie =
16182                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
16183 #ifdef DEBUGGING
16184             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
16185 #endif
16186             const regnode *nextbranch= NULL;
16187             I32 word_idx;
16188             sv_setpvs(sv, "");
16189             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
16190                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
16191
16192                 PerlIO_printf(Perl_debug_log, "%*s%s ",
16193                    (int)(2*(indent+3)), "",
16194                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
16195                             PL_colors[0], PL_colors[1],
16196                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
16197                             PERL_PV_PRETTY_ELLIPSES    |
16198                             PERL_PV_PRETTY_LTGT
16199                             )
16200                             : "???"
16201                 );
16202                 if (trie->jump) {
16203                     U16 dist= trie->jump[word_idx+1];
16204                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
16205                                   (UV)((dist ? this_trie + dist : next) - start));
16206                     if (dist) {
16207                         if (!nextbranch)
16208                             nextbranch= this_trie + trie->jump[0];
16209                         DUMPUNTIL(this_trie + dist, nextbranch);
16210                     }
16211                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
16212                         nextbranch= regnext((regnode *)nextbranch);
16213                 } else {
16214                     PerlIO_printf(Perl_debug_log, "\n");
16215                 }
16216             }
16217             if (last && next > last)
16218                 node= last;
16219             else
16220                 node= next;
16221         }
16222         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
16223             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
16224                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
16225         }
16226         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
16227             assert(next);
16228             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
16229         }
16230         else if ( op == PLUS || op == STAR) {
16231             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
16232         }
16233         else if (PL_regkind[(U8)op] == ANYOF) {
16234             /* arglen 1 + class block */
16235             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL)
16236                     ? ANYOF_POSIXL_SKIP : ANYOF_SKIP);
16237             node = NEXTOPER(node);
16238         }
16239         else if (PL_regkind[(U8)op] == EXACT) {
16240             /* Literal string, where present. */
16241             node += NODE_SZ_STR(node) - 1;
16242             node = NEXTOPER(node);
16243         }
16244         else {
16245             node = NEXTOPER(node);
16246             node += regarglen[(U8)op];
16247         }
16248         if (op == CURLYX || op == OPEN)
16249             indent++;
16250     }
16251     CLEAR_OPTSTART;
16252 #ifdef DEBUG_DUMPUNTIL
16253     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
16254 #endif
16255     return node;
16256 }
16257
16258 #endif  /* DEBUGGING */
16259
16260 /*
16261  * Local variables:
16262  * c-indentation-style: bsd
16263  * c-basic-offset: 4
16264  * indent-tabs-mode: nil
16265  * End:
16266  *
16267  * ex: set ts=8 sts=4 sw=4 et:
16268  */