regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 EXTERN_C const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_static.c"
  90 #include "charclass_invlists.h"
  91 #include "inline_invlist.c"
  92 #include "unicode_constants.h"
  93
  94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
  95  _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  96 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
  97  _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  98 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  99 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 100
 101 #ifndef STATIC
 102 #define STATIC  static
 103 #endif
 104
 105
 106 struct RExC_state_t {
 107     U32         flags;                  /* RXf_* are we folding, multilining? */
 108     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 109     char        *precomp;               /* uncompiled string. */
 110     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 111     regexp      *rx;                    /* perl core regexp structure */
 112     regexp_internal     *rxi;           /* internal data for regexp object
 113                                            pprivate field */
 114     char        *start;                 /* Start of input for compile */
 115     char        *end;                   /* End of input for compile */
 116     char        *parse;                 /* Input-scan pointer. */
 117     SSize_t     whilem_seen;            /* number of WHILEM in this expr */
 118     regnode     *emit_start;            /* Start of emitted-code area */
 119     regnode     *emit_bound;            /* First regnode outside of the
 120                                            allocated space */
 121     regnode     *emit;                  /* Code-emit pointer; if = &emit_dummy,
 122                                            implies compiling, so don't emit */
 123     regnode_ssc emit_dummy;             /* placeholder for emit to point to;
 124                                            large enough for the largest
 125                                            non-EXACTish node, so can use it as
 126                                            scratch in pass1 */
 127     I32         naughty;                /* How bad is this pattern? */
 128     I32         sawback;                /* Did we see \1, ...? */
 129     U32         seen;
 130     SSize_t     size;                   /* Code size. */
 131     I32                npar;            /* Capture buffer count, (OPEN) plus
 132                                            one. ("par" 0 is the whole
 133                                            pattern)*/
 134     I32         nestroot;               /* root parens we are in - used by
 135                                            accept */
 136     I32         extralen;
 137     I32         seen_zerolen;
 138     regnode     **open_parens;          /* pointers to open parens */
 139     regnode     **close_parens;         /* pointers to close parens */
 140     regnode     *opend;                 /* END node in program */
 141     I32         utf8;           /* whether the pattern is utf8 or not */
 142     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 143                                 /* XXX use this for future optimisation of case
 144                                  * where pattern must be upgraded to utf8. */
 145     I32         uni_semantics;  /* If a d charset modifier should use unicode
 146                                    rules, even if the pattern is not in
 147                                    utf8 */
 148     HV          *paren_names;           /* Paren names */
 149
 150     regnode     **recurse;              /* Recurse regops */
 151     I32         recurse_count;          /* Number of recurse regops */
 152     U8          *study_chunk_recursed;  /* bitmap of which parens we have moved
 153                                            through */
 154     U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
 155     I32         in_lookbehind;
 156     I32         contains_locale;
 157     I32         contains_i;
 158     I32         override_recoding;
 159     I32         in_multi_char_class;
 160     struct reg_code_block *code_blocks; /* positions of literal (?{})
 161                                             within pattern */
 162     int         num_code_blocks;        /* size of code_blocks[] */
 163     int         code_index;             /* next code_blocks[] slot */
 164     SSize_t     maxlen;                        /* mininum possible number of chars in string to match */
 165 #ifdef ADD_TO_REGEXEC
 166     char        *starttry;              /* -Dr: where regtry was called. */
 167 #define RExC_starttry   (pRExC_state->starttry)
 168 #endif
 169     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 170 #ifdef DEBUGGING
 171     const char  *lastparse;
 172     I32         lastnum;
 173     AV          *paren_name_list;       /* idx -> name */
 174 #define RExC_lastparse  (pRExC_state->lastparse)
 175 #define RExC_lastnum    (pRExC_state->lastnum)
 176 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 177 #endif
 178 };
 179
 180 #define RExC_flags      (pRExC_state->flags)
 181 #define RExC_pm_flags   (pRExC_state->pm_flags)
 182 #define RExC_precomp    (pRExC_state->precomp)
 183 #define RExC_rx_sv      (pRExC_state->rx_sv)
 184 #define RExC_rx         (pRExC_state->rx)
 185 #define RExC_rxi        (pRExC_state->rxi)
 186 #define RExC_start      (pRExC_state->start)
 187 #define RExC_end        (pRExC_state->end)
 188 #define RExC_parse      (pRExC_state->parse)
 189 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 190 #ifdef RE_TRACK_PATTERN_OFFSETS
 191 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the
 192                                                          others */
 193 #endif
 194 #define RExC_emit       (pRExC_state->emit)
 195 #define RExC_emit_dummy (pRExC_state->emit_dummy)
 196 #define RExC_emit_start (pRExC_state->emit_start)
 197 #define RExC_emit_bound (pRExC_state->emit_bound)
 198 #define RExC_naughty    (pRExC_state->naughty)
 199 #define RExC_sawback    (pRExC_state->sawback)
 200 #define RExC_seen       (pRExC_state->seen)
 201 #define RExC_size       (pRExC_state->size)
 202 #define RExC_maxlen        (pRExC_state->maxlen)
 203 #define RExC_npar       (pRExC_state->npar)
 204 #define RExC_nestroot   (pRExC_state->nestroot)
 205 #define RExC_extralen   (pRExC_state->extralen)
 206 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 207 #define RExC_utf8       (pRExC_state->utf8)
 208 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 209 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 210 #define RExC_open_parens        (pRExC_state->open_parens)
 211 #define RExC_close_parens       (pRExC_state->close_parens)
 212 #define RExC_opend      (pRExC_state->opend)
 213 #define RExC_paren_names        (pRExC_state->paren_names)
 214 #define RExC_recurse    (pRExC_state->recurse)
 215 #define RExC_recurse_count      (pRExC_state->recurse_count)
 216 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
 217 #define RExC_study_chunk_recursed_bytes  \
 218                                    (pRExC_state->study_chunk_recursed_bytes)
 219 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 220 #define RExC_contains_locale    (pRExC_state->contains_locale)
 221 #define RExC_contains_i (pRExC_state->contains_i)
 222 #define RExC_override_recoding (pRExC_state->override_recoding)
 223 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 224
 225
 226 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 227 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 228         ((*s) == '{' && regcurly(s, FALSE)))
 229
 230 /*
 231  * Flags to be passed up and down.
 232  */
 233 #define WORST           0       /* Worst case. */
 234 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 235
 236 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 237  * character.  (There needs to be a case: in the switch statement in regexec.c
 238  * for any node marked SIMPLE.)  Note that this is not the same thing as
 239  * REGNODE_SIMPLE */
 240 #define SIMPLE          0x02
 241 #define SPSTART         0x04    /* Starts with * or + */
 242 #define POSTPONED       0x08    /* (?1),(?&name), (??{...}) or similar */
 243 #define TRYAGAIN        0x10    /* Weeded out a declaration. */
 244 #define RESTART_UTF8    0x20    /* Restart, need to calcuate sizes as UTF-8 */
 245
 246 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 247
 248 /* whether trie related optimizations are enabled */
 249 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 250 #define TRIE_STUDY_OPT
 251 #define FULL_TRIE_STUDY
 252 #define TRIE_STCLASS
 253 #endif
 254
 255
 256
 257 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 258 #define PBITVAL(paren) (1 << ((paren) & 7))
 259 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 260 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 261 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 262
 263 #define REQUIRE_UTF8    STMT_START {                                       \
 264                                      if (!UTF) {                           \
 265                                          *flagp = RESTART_UTF8;            \
 266                                          return NULL;                      \
 267                                      }                                     \
 268                         } STMT_END
 269
 270 /* This converts the named class defined in regcomp.h to its equivalent class
 271  * number defined in handy.h. */
 272 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 273 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 274
 275 #define _invlist_union_complement_2nd(a, b, output) \
 276                         _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
 277 #define _invlist_intersection_complement_2nd(a, b, output) \
 278                  _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
 279
 280 /* About scan_data_t.
 281
 282   During optimisation we recurse through the regexp program performing
 283   various inplace (keyhole style) optimisations. In addition study_chunk
 284   and scan_commit populate this data structure with information about
 285   what strings MUST appear in the pattern. We look for the longest
 286   string that must appear at a fixed location, and we look for the
 287   longest string that may appear at a floating location. So for instance
 288   in the pattern:
 289
 290     /FOO[xX]A.*B[xX]BAR/
 291
 292   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 293   strings (because they follow a .* construct). study_chunk will identify
 294   both FOO and BAR as being the longest fixed and floating strings respectively.
 295
 296   The strings can be composites, for instance
 297
 298      /(f)(o)(o)/
 299
 300   will result in a composite fixed substring 'foo'.
 301
 302   For each string some basic information is maintained:
 303
 304   - offset or min_offset
 305     This is the position the string must appear at, or not before.
 306     It also implicitly (when combined with minlenp) tells us how many
 307     characters must match before the string we are searching for.
 308     Likewise when combined with minlenp and the length of the string it
 309     tells us how many characters must appear after the string we have
 310     found.
 311
 312   - max_offset
 313     Only used for floating strings. This is the rightmost point that
 314     the string can appear at. If set to SSize_t_MAX it indicates that the
 315     string can occur infinitely far to the right.
 316
 317   - minlenp
 318     A pointer to the minimum number of characters of the pattern that the
 319     string was found inside. This is important as in the case of positive
 320     lookahead or positive lookbehind we can have multiple patterns
 321     involved. Consider
 322
 323     /(?=FOO).*F/
 324
 325     The minimum length of the pattern overall is 3, the minimum length
 326     of the lookahead part is 3, but the minimum length of the part that
 327     will actually match is 1. So 'FOO's minimum length is 3, but the
 328     minimum length for the F is 1. This is important as the minimum length
 329     is used to determine offsets in front of and behind the string being
 330     looked for.  Since strings can be composites this is the length of the
 331     pattern at the time it was committed with a scan_commit. Note that
 332     the length is calculated by study_chunk, so that the minimum lengths
 333     are not known until the full pattern has been compiled, thus the
 334     pointer to the value.
 335
 336   - lookbehind
 337
 338     In the case of lookbehind the string being searched for can be
 339     offset past the start point of the final matching string.
 340     If this value was just blithely removed from the min_offset it would
 341     invalidate some of the calculations for how many chars must match
 342     before or after (as they are derived from min_offset and minlen and
 343     the length of the string being searched for).
 344     When the final pattern is compiled and the data is moved from the
 345     scan_data_t structure into the regexp structure the information
 346     about lookbehind is factored in, with the information that would
 347     have been lost precalculated in the end_shift field for the
 348     associated string.
 349
 350   The fields pos_min and pos_delta are used to store the minimum offset
 351   and the delta to the maximum offset at the current point in the pattern.
 352
 353 */
 354
 355 typedef struct scan_data_t {
 356     /*I32 len_min;      unused */
 357     /*I32 len_delta;    unused */
 358     SSize_t pos_min;
 359     SSize_t pos_delta;
 360     SV *last_found;
 361     SSize_t last_end;       /* min value, <0 unless valid. */
 362     SSize_t last_start_min;
 363     SSize_t last_start_max;
 364     SV **longest;           /* Either &l_fixed, or &l_float. */
 365     SV *longest_fixed;      /* longest fixed string found in pattern */
 366     SSize_t offset_fixed;   /* offset where it starts */
 367     SSize_t *minlen_fixed;  /* pointer to the minlen relevant to the string */
 368     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 369     SV *longest_float;      /* longest floating string found in pattern */
 370     SSize_t offset_float_min; /* earliest point in string it can appear */
 371     SSize_t offset_float_max; /* latest point in string it can appear */
 372     SSize_t *minlen_float;  /* pointer to the minlen relevant to the string */
 373     SSize_t lookbehind_float; /* is the pos of the string modified by LB */
 374     I32 flags;
 375     I32 whilem_c;
 376     SSize_t *last_closep;
 377     regnode_ssc *start_class;
 378 } scan_data_t;
 379
 380 /* The below is perhaps overboard, but this allows us to save a test at the
 381  * expense of a mask.  This is because on both EBCDIC and ASCII machines, 'A'
 382  * and 'a' differ by a single bit; the same with the upper and lower case of
 383  * all other ASCII-range alphabetics.  On ASCII platforms, they are 32 apart;
 384  * on EBCDIC, they are 64.  This uses an exclusive 'or' to find that bit and
 385  * then inverts it to form a mask, with just a single 0, in the bit position
 386  * where the upper- and lowercase differ.  XXX There are about 40 other
 387  * instances in the Perl core where this micro-optimization could be used.
 388  * Should decide if maintenance cost is worse, before changing those
 389  *
 390  * Returns a boolean as to whether or not 'v' is either a lowercase or
 391  * uppercase instance of 'c', where 'c' is in [A-Za-z].  If 'c' is a
 392  * compile-time constant, the generated code is better than some optimizing
 393  * compilers figure out, amounting to a mask and test.  The results are
 394  * meaningless if 'c' is not one of [A-Za-z] */
 395 #define isARG2_lower_or_UPPER_ARG1(c, v) \
 396                               (((v) & ~('A' ^ 'a')) ==  ((c) & ~('A' ^ 'a')))
 397
 398 /*
 399  * Forward declarations for pregcomp()'s friends.
 400  */
 401
 402 static const scan_data_t zero_scan_data =
 403   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 404
 405 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 406 #define SF_BEFORE_SEOL          0x0001
 407 #define SF_BEFORE_MEOL          0x0002
 408 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 409 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 410
 411 #define SF_FIX_SHIFT_EOL        (+2)
 412 #define SF_FL_SHIFT_EOL         (+4)
 413
 414 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 415 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 416
 417 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 418 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 419 #define SF_IS_INF               0x0040
 420 #define SF_HAS_PAR              0x0080
 421 #define SF_IN_PAR               0x0100
 422 #define SF_HAS_EVAL             0x0200
 423 #define SCF_DO_SUBSTR           0x0400
 424 #define SCF_DO_STCLASS_AND      0x0800
 425 #define SCF_DO_STCLASS_OR       0x1000
 426 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 427 #define SCF_WHILEM_VISITED_POS  0x2000
 428
 429 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 430 #define SCF_SEEN_ACCEPT         0x8000
 431 #define SCF_TRIE_DOING_RESTUDY 0x10000
 432
 433 #define UTF cBOOL(RExC_utf8)
 434
 435 /* The enums for all these are ordered so things work out correctly */
 436 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 437 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags)                    \
 438                                                      == REGEX_DEPENDS_CHARSET)
 439 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 440 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags)                \
 441                                                      >= REGEX_UNICODE_CHARSET)
 442 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags)                      \
 443                                             == REGEX_ASCII_RESTRICTED_CHARSET)
 444 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags)             \
 445                                             >= REGEX_ASCII_RESTRICTED_CHARSET)
 446 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags)                 \
 447                                         == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 448
 449 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 450
 451 /* For programs that want to be strictly Unicode compatible by dying if any
 452  * attempt is made to match a non-Unicode code point against a Unicode
 453  * property.  */
 454 #define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))
 455
 456 #define OOB_NAMEDCLASS          -1
 457
 458 /* There is no code point that is out-of-bounds, so this is problematic.  But
 459  * its only current use is to initialize a variable that is always set before
 460  * looked at. */
 461 #define OOB_UNICODE             0xDEADBEEF
 462
 463 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 464 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 465
 466
 467 /* length of regex to show in messages that don't mark a position within */
 468 #define RegexLengthToShowInErrorMessages 127
 469
 470 /*
 471  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 472  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 473  * op/pragma/warn/regcomp.
 474  */
 475 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 476 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 477
 478 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
 479                         " in m/%"UTF8f MARKER2 "%"UTF8f"/"
 480
 481 #define REPORT_LOCATION_ARGS(offset)            \
 482                 UTF8fARG(UTF, offset, RExC_precomp), \
 483                 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
 484
 485 /*
 486  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 487  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 488  * "...".
 489  */
 490 #define _FAIL(code) STMT_START {                                        \
 491     const char *ellipses = "";                                          \
 492     IV len = RExC_end - RExC_precomp;                                   \
 493                                                                         \
 494     if (!SIZE_ONLY)                                                     \
 495         SAVEFREESV(RExC_rx_sv);                                         \
 496     if (len > RegexLengthToShowInErrorMessages) {                       \
 497         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 498         len = RegexLengthToShowInErrorMessages - 10;                    \
 499         ellipses = "...";                                               \
 500     }                                                                   \
 501     code;                                                               \
 502 } STMT_END
 503
 504 #define FAIL(msg) _FAIL(                            \
 505     Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/",           \
 506             msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 507
 508 #define FAIL2(msg,arg) _FAIL(                       \
 509     Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/",         \
 510             arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 511
 512 /*
 513  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 514  */
 515 #define Simple_vFAIL(m) STMT_START {                                    \
 516     const IV offset = RExC_parse - RExC_precomp;                        \
 517     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 518             m, REPORT_LOCATION_ARGS(offset));   \
 519 } STMT_END
 520
 521 /*
 522  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 523  */
 524 #define vFAIL(m) STMT_START {                           \
 525     if (!SIZE_ONLY)                                     \
 526         SAVEFREESV(RExC_rx_sv);                         \
 527     Simple_vFAIL(m);                                    \
 528 } STMT_END
 529
 530 /*
 531  * Like Simple_vFAIL(), but accepts two arguments.
 532  */
 533 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 534     const IV offset = RExC_parse - RExC_precomp;                        \
 535     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,                      \
 536                       REPORT_LOCATION_ARGS(offset));    \
 537 } STMT_END
 538
 539 /*
 540  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 541  */
 542 #define vFAIL2(m,a1) STMT_START {                       \
 543     if (!SIZE_ONLY)                                     \
 544         SAVEFREESV(RExC_rx_sv);                         \
 545     Simple_vFAIL2(m, a1);                               \
 546 } STMT_END
 547
 548
 549 /*
 550  * Like Simple_vFAIL(), but accepts three arguments.
 551  */
 552 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 553     const IV offset = RExC_parse - RExC_precomp;                \
 554     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,          \
 555             REPORT_LOCATION_ARGS(offset));      \
 556 } STMT_END
 557
 558 /*
 559  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 560  */
 561 #define vFAIL3(m,a1,a2) STMT_START {                    \
 562     if (!SIZE_ONLY)                                     \
 563         SAVEFREESV(RExC_rx_sv);                         \
 564     Simple_vFAIL3(m, a1, a2);                           \
 565 } STMT_END
 566
 567 /*
 568  * Like Simple_vFAIL(), but accepts four arguments.
 569  */
 570 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 571     const IV offset = RExC_parse - RExC_precomp;                \
 572     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3,              \
 573             REPORT_LOCATION_ARGS(offset));      \
 574 } STMT_END
 575
 576 #define vFAIL4(m,a1,a2,a3) STMT_START {                 \
 577     if (!SIZE_ONLY)                                     \
 578         SAVEFREESV(RExC_rx_sv);                         \
 579     Simple_vFAIL4(m, a1, a2, a3);                       \
 580 } STMT_END
 581
 582 /* A specialized version of vFAIL2 that works with UTF8f */
 583 #define vFAIL2utf8f(m, a1) STMT_START { \
 584     const IV offset = RExC_parse - RExC_precomp;   \
 585     if (!SIZE_ONLY)                                \
 586         SAVEFREESV(RExC_rx_sv);                    \
 587     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
 588             REPORT_LOCATION_ARGS(offset));         \
 589 } STMT_END
 590
 591
 592 /* m is not necessarily a "literal string", in this macro */
 593 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
 594     const IV offset = loc - RExC_precomp;                               \
 595     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
 596             m, REPORT_LOCATION_ARGS(offset));       \
 597 } STMT_END
 598
 599 #define ckWARNreg(loc,m) STMT_START {                                   \
 600     const IV offset = loc - RExC_precomp;                               \
 601     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 602             REPORT_LOCATION_ARGS(offset));              \
 603 } STMT_END
 604
 605 #define vWARN_dep(loc, m) STMT_START {                                  \
 606     const IV offset = loc - RExC_precomp;                               \
 607     Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION,     \
 608             REPORT_LOCATION_ARGS(offset));              \
 609 } STMT_END
 610
 611 #define ckWARNdep(loc,m) STMT_START {                                   \
 612     const IV offset = loc - RExC_precomp;                               \
 613     Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),                   \
 614             m REPORT_LOCATION,                                          \
 615             REPORT_LOCATION_ARGS(offset));              \
 616 } STMT_END
 617
 618 #define ckWARNregdep(loc,m) STMT_START {                                \
 619     const IV offset = loc - RExC_precomp;                               \
 620     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 621             m REPORT_LOCATION,                                          \
 622             REPORT_LOCATION_ARGS(offset));              \
 623 } STMT_END
 624
 625 #define ckWARN2reg_d(loc,m, a1) STMT_START {                            \
 626     const IV offset = loc - RExC_precomp;                               \
 627     Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),                       \
 628             m REPORT_LOCATION,                                          \
 629             a1, REPORT_LOCATION_ARGS(offset));  \
 630 } STMT_END
 631
 632 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 633     const IV offset = loc - RExC_precomp;                               \
 634     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 635             a1, REPORT_LOCATION_ARGS(offset));  \
 636 } STMT_END
 637
 638 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 639     const IV offset = loc - RExC_precomp;                               \
 640     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 641             a1, a2, REPORT_LOCATION_ARGS(offset));      \
 642 } STMT_END
 643
 644 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 645     const IV offset = loc - RExC_precomp;                               \
 646     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 647             a1, a2, REPORT_LOCATION_ARGS(offset));      \
 648 } STMT_END
 649
 650 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 651     const IV offset = loc - RExC_precomp;                               \
 652     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 653             a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 654 } STMT_END
 655
 656 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 657     const IV offset = loc - RExC_precomp;                               \
 658     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 659             a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 660 } STMT_END
 661
 662 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 663     const IV offset = loc - RExC_precomp;                               \
 664     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 665             a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
 666 } STMT_END
 667
 668
 669 /* Allow for side effects in s */
 670 #define REGC(c,s) STMT_START {                  \
 671     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 672 } STMT_END
 673
 674 /* Macros for recording node offsets.   20001227 mjd@plover.com
 675  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 676  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 677  * Element 0 holds the number n.
 678  * Position is 1 indexed.
 679  */
 680 #ifndef RE_TRACK_PATTERN_OFFSETS
 681 #define Set_Node_Offset_To_R(node,byte)
 682 #define Set_Node_Offset(node,byte)
 683 #define Set_Cur_Node_Offset
 684 #define Set_Node_Length_To_R(node,len)
 685 #define Set_Node_Length(node,len)
 686 #define Set_Node_Cur_Length(node,start)
 687 #define Node_Offset(n)
 688 #define Node_Length(n)
 689 #define Set_Node_Offset_Length(node,offset,len)
 690 #define ProgLen(ri) ri->u.proglen
 691 #define SetProgLen(ri,x) ri->u.proglen = x
 692 #else
 693 #define ProgLen(ri) ri->u.offsets[0]
 694 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 695 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 696     if (! SIZE_ONLY) {                                                  \
 697         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 698                     __LINE__, (int)(node), (int)(byte)));               \
 699         if((node) < 0) {                                                \
 700             Perl_croak(aTHX_ "value of node is %d in Offset macro",     \
 701                                          (int)(node));                  \
 702         } else {                                                        \
 703             RExC_offsets[2*(node)-1] = (byte);                          \
 704         }                                                               \
 705     }                                                                   \
 706 } STMT_END
 707
 708 #define Set_Node_Offset(node,byte) \
 709     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 710 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 711
 712 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 713     if (! SIZE_ONLY) {                                                  \
 714         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 715                 __LINE__, (int)(node), (int)(len)));                    \
 716         if((node) < 0) {                                                \
 717             Perl_croak(aTHX_ "value of node is %d in Length macro",     \
 718                                          (int)(node));                  \
 719         } else {                                                        \
 720             RExC_offsets[2*(node)] = (len);                             \
 721         }                                                               \
 722     }                                                                   \
 723 } STMT_END
 724
 725 #define Set_Node_Length(node,len) \
 726     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 727 #define Set_Node_Cur_Length(node, start)                \
 728     Set_Node_Length(node, RExC_parse - start)
 729
 730 /* Get offsets and lengths */
 731 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 732 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 733
 734 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 735     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 736     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 737 } STMT_END
 738 #endif
 739
 740 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 741 #define EXPERIMENTAL_INPLACESCAN
 742 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 743
 744 #define DEBUG_RExC_seen() \
 745         DEBUG_OPTIMISE_MORE_r({                                             \
 746             PerlIO_printf(Perl_debug_log,"RExC_seen: ");                    \
 747                                                                             \
 748             if (RExC_seen & REG_ZERO_LEN_SEEN)                              \
 749                 PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN ");         \
 750                                                                             \
 751             if (RExC_seen & REG_LOOKBEHIND_SEEN)                            \
 752                 PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN ");       \
 753                                                                             \
 754             if (RExC_seen & REG_GPOS_SEEN)                                  \
 755                 PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN ");             \
 756                                                                             \
 757             if (RExC_seen & REG_CANY_SEEN)                                  \
 758                 PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN ");             \
 759                                                                             \
 760             if (RExC_seen & REG_RECURSE_SEEN)                               \
 761                 PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN ");          \
 762                                                                             \
 763             if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)                         \
 764                 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN ");    \
 765                                                                             \
 766             if (RExC_seen & REG_VERBARG_SEEN)                               \
 767                 PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN ");          \
 768                                                                             \
 769             if (RExC_seen & REG_CUTGROUP_SEEN)                              \
 770                 PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN ");         \
 771                                                                             \
 772             if (RExC_seen & REG_RUN_ON_COMMENT_SEEN)                        \
 773                 PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN ");   \
 774                                                                             \
 775             if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)                        \
 776                 PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN ");   \
 777                                                                             \
 778             if (RExC_seen & REG_GOSTART_SEEN)                               \
 779                 PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN ");          \
 780                                                                             \
 781             if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                               \
 782                 PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN ");          \
 783                                                                             \
 784             PerlIO_printf(Perl_debug_log,"\n");                             \
 785         });
 786
 787 #define DEBUG_STUDYDATA(str,data,depth)                              \
 788 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 789     PerlIO_printf(Perl_debug_log,                                    \
 790         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 791         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 792         (int)(depth)*2, "",                                          \
 793         (IV)((data)->pos_min),                                       \
 794         (IV)((data)->pos_delta),                                     \
 795         (UV)((data)->flags),                                         \
 796         (IV)((data)->whilem_c),                                      \
 797         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 798         is_inf ? "INF " : ""                                         \
 799     );                                                               \
 800     if ((data)->last_found)                                          \
 801         PerlIO_printf(Perl_debug_log,                                \
 802             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 803             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 804             SvPVX_const((data)->last_found),                         \
 805             (IV)((data)->last_end),                                  \
 806             (IV)((data)->last_start_min),                            \
 807             (IV)((data)->last_start_max),                            \
 808             ((data)->longest &&                                      \
 809              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 810             SvPVX_const((data)->longest_fixed),                      \
 811             (IV)((data)->offset_fixed),                              \
 812             ((data)->longest &&                                      \
 813              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 814             SvPVX_const((data)->longest_float),                      \
 815             (IV)((data)->offset_float_min),                          \
 816             (IV)((data)->offset_float_max)                           \
 817         );                                                           \
 818     PerlIO_printf(Perl_debug_log,"\n");                              \
 819 });
 820
 821 /* Mark that we cannot extend a found fixed substring at this point.
 822    Update the longest found anchored substring and the longest found
 823    floating substrings if needed. */
 824
 825 STATIC void
 826 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
 827                     SSize_t *minlenp, int is_inf)
 828 {
 829     const STRLEN l = CHR_SVLEN(data->last_found);
 830     const STRLEN old_l = CHR_SVLEN(*data->longest);
 831     GET_RE_DEBUG_FLAGS_DECL;
 832
 833     PERL_ARGS_ASSERT_SCAN_COMMIT;
 834
 835     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 836         SvSetMagicSV(*data->longest, data->last_found);
 837         if (*data->longest == data->longest_fixed) {
 838             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 839             if (data->flags & SF_BEFORE_EOL)
 840                 data->flags
 841                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 842             else
 843                 data->flags &= ~SF_FIX_BEFORE_EOL;
 844             data->minlen_fixed=minlenp;
 845             data->lookbehind_fixed=0;
 846         }
 847         else { /* *data->longest == data->longest_float */
 848             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 849             data->offset_float_max = (l
 850                                       ? data->last_start_max
 851                                       : (data->pos_delta == SSize_t_MAX
 852                                          ? SSize_t_MAX
 853                                          : data->pos_min + data->pos_delta));
 854             if (is_inf
 855                  || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
 856                 data->offset_float_max = SSize_t_MAX;
 857             if (data->flags & SF_BEFORE_EOL)
 858                 data->flags
 859                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 860             else
 861                 data->flags &= ~SF_FL_BEFORE_EOL;
 862             data->minlen_float=minlenp;
 863             data->lookbehind_float=0;
 864         }
 865     }
 866     SvCUR_set(data->last_found, 0);
 867     {
 868         SV * const sv = data->last_found;
 869         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 870             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 871             if (mg)
 872                 mg->mg_len = 0;
 873         }
 874     }
 875     data->last_end = -1;
 876     data->flags &= ~SF_BEFORE_EOL;
 877     DEBUG_STUDYDATA("commit: ",data,0);
 878 }
 879
 880 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
 881  * list that describes which code points it matches */
 882
 883 STATIC void
 884 S_ssc_anything(pTHX_ regnode_ssc *ssc)
 885 {
 886     /* Set the SSC 'ssc' to match an empty string or any code point */
 887
 888     PERL_ARGS_ASSERT_SSC_ANYTHING;
 889
 890     assert(is_ANYOF_SYNTHETIC(ssc));
 891
 892     ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
 893     _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
 894     ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING;    /* Plus match empty string */
 895 }
 896
 897 STATIC int
 898 S_ssc_is_anything(pTHX_ const regnode_ssc *ssc)
 899 {
 900     /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
 901      * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
 902      * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
 903      * in any way, so there's no point in using it */
 904
 905     UV start, end;
 906     bool ret;
 907
 908     PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
 909
 910     assert(is_ANYOF_SYNTHETIC(ssc));
 911
 912     if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) {
 913         return FALSE;
 914     }
 915
 916     /* See if the list consists solely of the range 0 - Infinity */
 917     invlist_iterinit(ssc->invlist);
 918     ret = invlist_iternext(ssc->invlist, &start, &end)
 919           && start == 0
 920           && end == UV_MAX;
 921
 922     invlist_iterfinish(ssc->invlist);
 923
 924     if (ret) {
 925         return TRUE;
 926     }
 927
 928     /* If e.g., both \w and \W are set, matches everything */
 929     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
 930         int i;
 931         for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
 932             if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
 933                 return TRUE;
 934             }
 935         }
 936     }
 937
 938     return FALSE;
 939 }
 940
 941 STATIC void
 942 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
 943 {
 944     /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
 945      * string, any code point, or any posix class under locale */
 946
 947     PERL_ARGS_ASSERT_SSC_INIT;
 948
 949     Zero(ssc, 1, regnode_ssc);
 950     set_ANYOF_SYNTHETIC(ssc);
 951     ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY);
 952     ssc_anything(ssc);
 953
 954     /* If any portion of the regex is to operate under locale rules,
 955      * initialization includes it.  The reason this isn't done for all regexes
 956      * is that the optimizer was written under the assumption that locale was
 957      * all-or-nothing.  Given the complexity and lack of documentation in the
 958      * optimizer, and that there are inadequate test cases for locale, many
 959      * parts of it may not work properly, it is safest to avoid locale unless
 960      * necessary. */
 961     if (RExC_contains_locale) {
 962         ANYOF_POSIXL_SETALL(ssc);
 963     }
 964     else {
 965         ANYOF_POSIXL_ZERO(ssc);
 966     }
 967 }
 968
 969 STATIC int
 970 S_ssc_is_cp_posixl_init(pTHX_ const RExC_state_t *pRExC_state,
 971                               const regnode_ssc *ssc)
 972 {
 973     /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
 974      * to the list of code points matched, and locale posix classes; hence does
 975      * not check its flags) */
 976
 977     UV start, end;
 978     bool ret;
 979
 980     PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
 981
 982     assert(is_ANYOF_SYNTHETIC(ssc));
 983
 984     invlist_iterinit(ssc->invlist);
 985     ret = invlist_iternext(ssc->invlist, &start, &end)
 986           && start == 0
 987           && end == UV_MAX;
 988
 989     invlist_iterfinish(ssc->invlist);
 990
 991     if (! ret) {
 992         return FALSE;
 993     }
 994
 995     if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
 996         return FALSE;
 997     }
 998
 999     return TRUE;
1000 }
1001
1002 STATIC SV*
1003 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1004                                const regnode_charclass* const node)
1005 {
1006     /* Returns a mortal inversion list defining which code points are matched
1007      * by 'node', which is of type ANYOF.  Handles complementing the result if
1008      * appropriate.  If some code points aren't knowable at this time, the
1009      * returned list must, and will, contain every code point that is a
1010      * possibility. */
1011
1012     SV* invlist = sv_2mortal(_new_invlist(0));
1013     SV* only_utf8_locale_invlist = NULL;
1014     unsigned int i;
1015     const U32 n = ARG(node);
1016     bool new_node_has_latin1 = FALSE;
1017
1018     PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1019
1020     /* Look at the data structure created by S_set_ANYOF_arg() */
1021     if (n != ANYOF_NONBITMAP_EMPTY) {
1022         SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1023         AV * const av = MUTABLE_AV(SvRV(rv));
1024         SV **const ary = AvARRAY(av);
1025         assert(RExC_rxi->data->what[n] == 's');
1026
1027         if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1028             invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1029         }
1030         else if (ary[0] && ary[0] != &PL_sv_undef) {
1031
1032             /* Here, no compile-time swash, and there are things that won't be
1033              * known until runtime -- we have to assume it could be anything */
1034             return _add_range_to_invlist(invlist, 0, UV_MAX);
1035         }
1036         else if (ary[3] && ary[3] != &PL_sv_undef) {
1037
1038             /* Here no compile-time swash, and no run-time only data.  Use the
1039              * node's inversion list */
1040             invlist = sv_2mortal(invlist_clone(ary[3]));
1041         }
1042
1043         /* Get the code points valid only under UTF-8 locales */
1044         if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
1045             && ary[2] && ary[2] != &PL_sv_undef)
1046         {
1047             only_utf8_locale_invlist = ary[2];
1048         }
1049     }
1050
1051     /* An ANYOF node contains a bitmap for the first 256 code points, and an
1052      * inversion list for the others, but if there are code points that should
1053      * match only conditionally on the target string being UTF-8, those are
1054      * placed in the inversion list, and not the bitmap.  Since there are
1055      * circumstances under which they could match, they are included in the
1056      * SSC.  But if the ANYOF node is to be inverted, we have to exclude them
1057      * here, so that when we invert below, the end result actually does include
1058      * them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We have to do this here
1059      * before we add the unconditionally matched code points */
1060     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1061         _invlist_intersection_complement_2nd(invlist,
1062                                              PL_UpperLatin1,
1063                                              &invlist);
1064     }
1065
1066     /* Add in the points from the bit map */
1067     for (i = 0; i < 256; i++) {
1068         if (ANYOF_BITMAP_TEST(node, i)) {
1069             invlist = add_cp_to_invlist(invlist, i);
1070             new_node_has_latin1 = TRUE;
1071         }
1072     }
1073
1074     /* If this can match all upper Latin1 code points, have to add them
1075      * as well */
1076     if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_NON_ASCII_ALL) {
1077         _invlist_union(invlist, PL_UpperLatin1, &invlist);
1078     }
1079
1080     /* Similarly for these */
1081     if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
1082         invlist = _add_range_to_invlist(invlist, 256, UV_MAX);
1083     }
1084
1085     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1086         _invlist_invert(invlist);
1087     }
1088     else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
1089
1090         /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1091          * locale.  We can skip this if there are no 0-255 at all. */
1092         _invlist_union(invlist, PL_Latin1, &invlist);
1093     }
1094
1095     /* Similarly add the UTF-8 locale possible matches.  These have to be
1096      * deferred until after the non-UTF-8 locale ones are taken care of just
1097      * above, or it leads to wrong results under ANYOF_INVERT */
1098     if (only_utf8_locale_invlist) {
1099         _invlist_union_maybe_complement_2nd(invlist,
1100                                             only_utf8_locale_invlist,
1101                                             ANYOF_FLAGS(node) & ANYOF_INVERT,
1102                                             &invlist);
1103     }
1104
1105     return invlist;
1106 }
1107
1108 /* These two functions currently do the exact same thing */
1109 #define ssc_init_zero           ssc_init
1110
1111 #define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
1112 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1113
1114 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
1115  * should not be inverted.  'and_with->flags & ANYOF_POSIXL' should be 0 if
1116  * 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1117
1118 STATIC void
1119 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1120                 const regnode_charclass *and_with)
1121 {
1122     /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1123      * another SSC or a regular ANYOF class.  Can create false positives. */
1124
1125     SV* anded_cp_list;
1126     U8  anded_flags;
1127
1128     PERL_ARGS_ASSERT_SSC_AND;
1129
1130     assert(is_ANYOF_SYNTHETIC(ssc));
1131
1132     /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1133      * the code point inversion list and just the relevant flags */
1134     if (is_ANYOF_SYNTHETIC(and_with)) {
1135         anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1136         anded_flags = ANYOF_FLAGS(and_with);
1137
1138         /* XXX This is a kludge around what appears to be deficiencies in the
1139          * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
1140          * there are paths through the optimizer where it doesn't get weeded
1141          * out when it should.  And if we don't make some extra provision for
1142          * it like the code just below, it doesn't get added when it should.
1143          * This solution is to add it only when AND'ing, which is here, and
1144          * only when what is being AND'ed is the pristine, original node
1145          * matching anything.  Thus it is like adding it to ssc_anything() but
1146          * only when the result is to be AND'ed.  Probably the same solution
1147          * could be adopted for the same problem we have with /l matching,
1148          * which is solved differently in S_ssc_init(), and that would lead to
1149          * fewer false positives than that solution has.  But if this solution
1150          * creates bugs, the consequences are only that a warning isn't raised
1151          * that should be; while the consequences for having /l bugs is
1152          * incorrect matches */
1153         if (ssc_is_anything((regnode_ssc *)and_with)) {
1154             anded_flags |= ANYOF_WARN_SUPER;
1155         }
1156     }
1157     else {
1158         anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1159         anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1160     }
1161
1162     ANYOF_FLAGS(ssc) &= anded_flags;
1163
1164     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1165      * C2 is the list of code points in 'and-with'; P2, its posix classes.
1166      * 'and_with' may be inverted.  When not inverted, we have the situation of
1167      * computing:
1168      *  (C1 | P1) & (C2 | P2)
1169      *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1170      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1171      *                    <=  ((C1 & C2) |       P2)) | ( P1       | (P1 & P2))
1172      *                    <=  ((C1 & C2) | P1 | P2)
1173      * Alternatively, the last few steps could be:
1174      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1175      *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
1176      *                    <=  (C1 | C2 | (P1 & P2))
1177      * We favor the second approach if either P1 or P2 is non-empty.  This is
1178      * because these components are a barrier to doing optimizations, as what
1179      * they match cannot be known until the moment of matching as they are
1180      * dependent on the current locale, 'AND"ing them likely will reduce or
1181      * eliminate them.
1182      * But we can do better if we know that C1,P1 are in their initial state (a
1183      * frequent occurrence), each matching everything:
1184      *  (<everything>) & (C2 | P2) =  C2 | P2
1185      * Similarly, if C2,P2 are in their initial state (again a frequent
1186      * occurrence), the result is a no-op
1187      *  (C1 | P1) & (<everything>) =  C1 | P1
1188      *
1189      * Inverted, we have
1190      *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
1191      *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1192      *                         <=  (C1 & ~C2) | (P1 & ~P2)
1193      * */
1194
1195     if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1196         && ! is_ANYOF_SYNTHETIC(and_with))
1197     {
1198         unsigned int i;
1199
1200         ssc_intersection(ssc,
1201                          anded_cp_list,
1202                          FALSE /* Has already been inverted */
1203                          );
1204
1205         /* If either P1 or P2 is empty, the intersection will be also; can skip
1206          * the loop */
1207         if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) {
1208             ANYOF_POSIXL_ZERO(ssc);
1209         }
1210         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1211
1212             /* Note that the Posix class component P from 'and_with' actually
1213              * looks like:
1214              *      P = Pa | Pb | ... | Pn
1215              * where each component is one posix class, such as in [\w\s].
1216              * Thus
1217              *      ~P = ~(Pa | Pb | ... | Pn)
1218              *         = ~Pa & ~Pb & ... & ~Pn
1219              *        <= ~Pa | ~Pb | ... | ~Pn
1220              * The last is something we can easily calculate, but unfortunately
1221              * is likely to have many false positives.  We could do better
1222              * in some (but certainly not all) instances if two classes in
1223              * P have known relationships.  For example
1224              *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1225              * So
1226              *      :lower: & :print: = :lower:
1227              * And similarly for classes that must be disjoint.  For example,
1228              * since \s and \w can have no elements in common based on rules in
1229              * the POSIX standard,
1230              *      \w & ^\S = nothing
1231              * Unfortunately, some vendor locales do not meet the Posix
1232              * standard, in particular almost everything by Microsoft.
1233              * The loop below just changes e.g., \w into \W and vice versa */
1234
1235             regnode_charclass_posixl temp;
1236             int add = 1;    /* To calculate the index of the complement */
1237
1238             ANYOF_POSIXL_ZERO(&temp);
1239             for (i = 0; i < ANYOF_MAX; i++) {
1240                 assert(i % 2 != 0
1241                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1242                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1243
1244                 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1245                     ANYOF_POSIXL_SET(&temp, i + add);
1246                 }
1247                 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1248             }
1249             ANYOF_POSIXL_AND(&temp, ssc);
1250
1251         } /* else ssc already has no posixes */
1252     } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
1253          in its initial state */
1254     else if (! is_ANYOF_SYNTHETIC(and_with)
1255              || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1256     {
1257         /* But if 'ssc' is in its initial state, the result is just 'and_with';
1258          * copy it over 'ssc' */
1259         if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1260             if (is_ANYOF_SYNTHETIC(and_with)) {
1261                 StructCopy(and_with, ssc, regnode_ssc);
1262             }
1263             else {
1264                 ssc->invlist = anded_cp_list;
1265                 ANYOF_POSIXL_ZERO(ssc);
1266                 if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1267                     ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1268                 }
1269             }
1270         }
1271         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1272                  || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL))
1273         {
1274             /* One or the other of P1, P2 is non-empty. */
1275             if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1276                 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1277             }
1278             ssc_union(ssc, anded_cp_list, FALSE);
1279         }
1280         else { /* P1 = P2 = empty */
1281             ssc_intersection(ssc, anded_cp_list, FALSE);
1282         }
1283     }
1284 }
1285
1286 STATIC void
1287 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1288                const regnode_charclass *or_with)
1289 {
1290     /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1291      * another SSC or a regular ANYOF class.  Can create false positives if
1292      * 'or_with' is to be inverted. */
1293
1294     SV* ored_cp_list;
1295     U8 ored_flags;
1296
1297     PERL_ARGS_ASSERT_SSC_OR;
1298
1299     assert(is_ANYOF_SYNTHETIC(ssc));
1300
1301     /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1302      * the code point inversion list and just the relevant flags */
1303     if (is_ANYOF_SYNTHETIC(or_with)) {
1304         ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1305         ored_flags = ANYOF_FLAGS(or_with);
1306     }
1307     else {
1308         ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1309         ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1310     }
1311
1312     ANYOF_FLAGS(ssc) |= ored_flags;
1313
1314     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1315      * C2 is the list of code points in 'or-with'; P2, its posix classes.
1316      * 'or_with' may be inverted.  When not inverted, we have the simple
1317      * situation of computing:
1318      *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
1319      * If P1|P2 yields a situation with both a class and its complement are
1320      * set, like having both \w and \W, this matches all code points, and we
1321      * can delete these from the P component of the ssc going forward.  XXX We
1322      * might be able to delete all the P components, but I (khw) am not certain
1323      * about this, and it is better to be safe.
1324      *
1325      * Inverted, we have
1326      *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
1327      *                         <=  (C1 | P1) | ~C2
1328      *                         <=  (C1 | ~C2) | P1
1329      * (which results in actually simpler code than the non-inverted case)
1330      * */
1331
1332     if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1333         && ! is_ANYOF_SYNTHETIC(or_with))
1334     {
1335         /* We ignore P2, leaving P1 going forward */
1336     }   /* else  Not inverted */
1337     else if (ANYOF_FLAGS(or_with) & ANYOF_POSIXL) {
1338         ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1339         if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1340             unsigned int i;
1341             for (i = 0; i < ANYOF_MAX; i += 2) {
1342                 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1343                 {
1344                     ssc_match_all_cp(ssc);
1345                     ANYOF_POSIXL_CLEAR(ssc, i);
1346                     ANYOF_POSIXL_CLEAR(ssc, i+1);
1347                 }
1348             }
1349         }
1350     }
1351
1352     ssc_union(ssc,
1353               ored_cp_list,
1354               FALSE /* Already has been inverted */
1355               );
1356 }
1357
1358 PERL_STATIC_INLINE void
1359 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1360 {
1361     PERL_ARGS_ASSERT_SSC_UNION;
1362
1363     assert(is_ANYOF_SYNTHETIC(ssc));
1364
1365     _invlist_union_maybe_complement_2nd(ssc->invlist,
1366                                         invlist,
1367                                         invert2nd,
1368                                         &ssc->invlist);
1369 }
1370
1371 PERL_STATIC_INLINE void
1372 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1373                          SV* const invlist,
1374                          const bool invert2nd)
1375 {
1376     PERL_ARGS_ASSERT_SSC_INTERSECTION;
1377
1378     assert(is_ANYOF_SYNTHETIC(ssc));
1379
1380     _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1381                                                invlist,
1382                                                invert2nd,
1383                                                &ssc->invlist);
1384 }
1385
1386 PERL_STATIC_INLINE void
1387 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1388 {
1389     PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1390
1391     assert(is_ANYOF_SYNTHETIC(ssc));
1392
1393     ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1394 }
1395
1396 PERL_STATIC_INLINE void
1397 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1398 {
1399     /* AND just the single code point 'cp' into the SSC 'ssc' */
1400
1401     SV* cp_list = _new_invlist(2);
1402
1403     PERL_ARGS_ASSERT_SSC_CP_AND;
1404
1405     assert(is_ANYOF_SYNTHETIC(ssc));
1406
1407     cp_list = add_cp_to_invlist(cp_list, cp);
1408     ssc_intersection(ssc, cp_list,
1409                      FALSE /* Not inverted */
1410                      );
1411     SvREFCNT_dec_NN(cp_list);
1412 }
1413
1414 PERL_STATIC_INLINE void
1415 S_ssc_clear_locale(pTHX_ regnode_ssc *ssc)
1416 {
1417     /* Set the SSC 'ssc' to not match any locale things */
1418
1419     PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1420
1421     assert(is_ANYOF_SYNTHETIC(ssc));
1422
1423     ANYOF_POSIXL_ZERO(ssc);
1424     ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1425 }
1426
1427 STATIC void
1428 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1429 {
1430     /* The inversion list in the SSC is marked mortal; now we need a more
1431      * permanent copy, which is stored the same way that is done in a regular
1432      * ANYOF node, with the first 256 code points in a bit map */
1433
1434     SV* invlist = invlist_clone(ssc->invlist);
1435
1436     PERL_ARGS_ASSERT_SSC_FINALIZE;
1437
1438     assert(is_ANYOF_SYNTHETIC(ssc));
1439
1440     /* The code in this file assumes that all but these flags aren't relevant
1441      * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the
1442      * time we reach here */
1443     assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
1444
1445     populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1446
1447     set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1448                                 NULL, NULL, NULL, FALSE);
1449
1450     /* Make sure is clone-safe */
1451     ssc->invlist = NULL;
1452
1453     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1454         ANYOF_FLAGS(ssc) |= ANYOF_POSIXL;
1455     }
1456
1457     assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1458 }
1459
1460 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1461 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1462 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1463 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list         \
1464                                ? (TRIE_LIST_CUR( idx ) - 1)           \
1465                                : 0 )
1466
1467
1468 #ifdef DEBUGGING
1469 /*
1470    dump_trie(trie,widecharmap,revcharmap)
1471    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1472    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1473
1474    These routines dump out a trie in a somewhat readable format.
1475    The _interim_ variants are used for debugging the interim
1476    tables that are used to generate the final compressed
1477    representation which is what dump_trie expects.
1478
1479    Part of the reason for their existence is to provide a form
1480    of documentation as to how the different representations function.
1481
1482 */
1483
1484 /*
1485   Dumps the final compressed table form of the trie to Perl_debug_log.
1486   Used for debugging make_trie().
1487 */
1488
1489 STATIC void
1490 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1491             AV *revcharmap, U32 depth)
1492 {
1493     U32 state;
1494     SV *sv=sv_newmortal();
1495     int colwidth= widecharmap ? 6 : 4;
1496     U16 word;
1497     GET_RE_DEBUG_FLAGS_DECL;
1498
1499     PERL_ARGS_ASSERT_DUMP_TRIE;
1500
1501     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1502         (int)depth * 2 + 2,"",
1503         "Match","Base","Ofs" );
1504
1505     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1506         SV ** const tmp = av_fetch( revcharmap, state, 0);
1507         if ( tmp ) {
1508             PerlIO_printf( Perl_debug_log, "%*s",
1509                 colwidth,
1510                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1511                             PL_colors[0], PL_colors[1],
1512                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1513                             PERL_PV_ESCAPE_FIRSTCHAR
1514                 )
1515             );
1516         }
1517     }
1518     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1519         (int)depth * 2 + 2,"");
1520
1521     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1522         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1523     PerlIO_printf( Perl_debug_log, "\n");
1524
1525     for( state = 1 ; state < trie->statecount ; state++ ) {
1526         const U32 base = trie->states[ state ].trans.base;
1527
1528         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1529                                        (int)depth * 2 + 2,"", (UV)state);
1530
1531         if ( trie->states[ state ].wordnum ) {
1532             PerlIO_printf( Perl_debug_log, " W%4X",
1533                                            trie->states[ state ].wordnum );
1534         } else {
1535             PerlIO_printf( Perl_debug_log, "%6s", "" );
1536         }
1537
1538         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1539
1540         if ( base ) {
1541             U32 ofs = 0;
1542
1543             while( ( base + ofs  < trie->uniquecharcount ) ||
1544                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1545                      && trie->trans[ base + ofs - trie->uniquecharcount ].check
1546                                                                     != state))
1547                     ofs++;
1548
1549             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1550
1551             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1552                 if ( ( base + ofs >= trie->uniquecharcount )
1553                         && ( base + ofs - trie->uniquecharcount
1554                                                         < trie->lasttrans )
1555                         && trie->trans[ base + ofs
1556                                     - trie->uniquecharcount ].check == state )
1557                 {
1558                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1559                     colwidth,
1560                     (UV)trie->trans[ base + ofs
1561                                              - trie->uniquecharcount ].next );
1562                 } else {
1563                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1564                 }
1565             }
1566
1567             PerlIO_printf( Perl_debug_log, "]");
1568
1569         }
1570         PerlIO_printf( Perl_debug_log, "\n" );
1571     }
1572     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1573                                 (int)depth*2, "");
1574     for (word=1; word <= trie->wordcount; word++) {
1575         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1576             (int)word, (int)(trie->wordinfo[word].prev),
1577             (int)(trie->wordinfo[word].len));
1578     }
1579     PerlIO_printf(Perl_debug_log, "\n" );
1580 }
1581 /*
1582   Dumps a fully constructed but uncompressed trie in list form.
1583   List tries normally only are used for construction when the number of
1584   possible chars (trie->uniquecharcount) is very high.
1585   Used for debugging make_trie().
1586 */
1587 STATIC void
1588 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1589                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1590                          U32 depth)
1591 {
1592     U32 state;
1593     SV *sv=sv_newmortal();
1594     int colwidth= widecharmap ? 6 : 4;
1595     GET_RE_DEBUG_FLAGS_DECL;
1596
1597     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1598
1599     /* print out the table precompression.  */
1600     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1601         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1602         "------:-----+-----------------\n" );
1603
1604     for( state=1 ; state < next_alloc ; state ++ ) {
1605         U16 charid;
1606
1607         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1608             (int)depth * 2 + 2,"", (UV)state  );
1609         if ( ! trie->states[ state ].wordnum ) {
1610             PerlIO_printf( Perl_debug_log, "%5s| ","");
1611         } else {
1612             PerlIO_printf( Perl_debug_log, "W%4x| ",
1613                 trie->states[ state ].wordnum
1614             );
1615         }
1616         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1617             SV ** const tmp = av_fetch( revcharmap,
1618                                         TRIE_LIST_ITEM(state,charid).forid, 0);
1619             if ( tmp ) {
1620                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1621                     colwidth,
1622                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1623                               colwidth,
1624                               PL_colors[0], PL_colors[1],
1625                               (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1626                               | PERL_PV_ESCAPE_FIRSTCHAR
1627                     ) ,
1628                     TRIE_LIST_ITEM(state,charid).forid,
1629                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1630                 );
1631                 if (!(charid % 10))
1632                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1633                         (int)((depth * 2) + 14), "");
1634             }
1635         }
1636         PerlIO_printf( Perl_debug_log, "\n");
1637     }
1638 }
1639
1640 /*
1641   Dumps a fully constructed but uncompressed trie in table form.
1642   This is the normal DFA style state transition table, with a few
1643   twists to facilitate compression later.
1644   Used for debugging make_trie().
1645 */
1646 STATIC void
1647 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1648                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1649                           U32 depth)
1650 {
1651     U32 state;
1652     U16 charid;
1653     SV *sv=sv_newmortal();
1654     int colwidth= widecharmap ? 6 : 4;
1655     GET_RE_DEBUG_FLAGS_DECL;
1656
1657     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1658
1659     /*
1660        print out the table precompression so that we can do a visual check
1661        that they are identical.
1662      */
1663
1664     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1665
1666     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1667         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1668         if ( tmp ) {
1669             PerlIO_printf( Perl_debug_log, "%*s",
1670                 colwidth,
1671                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1672                             PL_colors[0], PL_colors[1],
1673                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1674                             PERL_PV_ESCAPE_FIRSTCHAR
1675                 )
1676             );
1677         }
1678     }
1679
1680     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1681
1682     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1683         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1684     }
1685
1686     PerlIO_printf( Perl_debug_log, "\n" );
1687
1688     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1689
1690         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1691             (int)depth * 2 + 2,"",
1692             (UV)TRIE_NODENUM( state ) );
1693
1694         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1695             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1696             if (v)
1697                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1698             else
1699                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1700         }
1701         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1702             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1703                                             (UV)trie->trans[ state ].check );
1704         } else {
1705             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1706                                             (UV)trie->trans[ state ].check,
1707             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1708         }
1709     }
1710 }
1711
1712 #endif
1713
1714
1715 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1716   startbranch: the first branch in the whole branch sequence
1717   first      : start branch of sequence of branch-exact nodes.
1718                May be the same as startbranch
1719   last       : Thing following the last branch.
1720                May be the same as tail.
1721   tail       : item following the branch sequence
1722   count      : words in the sequence
1723   flags      : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS)/
1724   depth      : indent depth
1725
1726 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1727
1728 A trie is an N'ary tree where the branches are determined by digital
1729 decomposition of the key. IE, at the root node you look up the 1st character and
1730 follow that branch repeat until you find the end of the branches. Nodes can be
1731 marked as "accepting" meaning they represent a complete word. Eg:
1732
1733   /he|she|his|hers/
1734
1735 would convert into the following structure. Numbers represent states, letters
1736 following numbers represent valid transitions on the letter from that state, if
1737 the number is in square brackets it represents an accepting state, otherwise it
1738 will be in parenthesis.
1739
1740       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1741       |    |
1742       |   (2)
1743       |    |
1744      (1)   +-i->(6)-+-s->[7]
1745       |
1746       +-s->(3)-+-h->(4)-+-e->[5]
1747
1748       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1749
1750 This shows that when matching against the string 'hers' we will begin at state 1
1751 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1752 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1753 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1754 single traverse. We store a mapping from accepting to state to which word was
1755 matched, and then when we have multiple possibilities we try to complete the
1756 rest of the regex in the order in which they occured in the alternation.
1757
1758 The only prior NFA like behaviour that would be changed by the TRIE support is
1759 the silent ignoring of duplicate alternations which are of the form:
1760
1761  / (DUPE|DUPE) X? (?{ ... }) Y /x
1762
1763 Thus EVAL blocks following a trie may be called a different number of times with
1764 and without the optimisation. With the optimisations dupes will be silently
1765 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1766 the following demonstrates:
1767
1768  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1769
1770 which prints out 'word' three times, but
1771
1772  'words'=~/(word|word|word)(?{ print $1 })S/
1773
1774 which doesnt print it out at all. This is due to other optimisations kicking in.
1775
1776 Example of what happens on a structural level:
1777
1778 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1779
1780    1: CURLYM[1] {1,32767}(18)
1781    5:   BRANCH(8)
1782    6:     EXACT <ac>(16)
1783    8:   BRANCH(11)
1784    9:     EXACT <ad>(16)
1785   11:   BRANCH(14)
1786   12:     EXACT <ab>(16)
1787   16:   SUCCEED(0)
1788   17:   NOTHING(18)
1789   18: END(0)
1790
1791 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1792 and should turn into:
1793
1794    1: CURLYM[1] {1,32767}(18)
1795    5:   TRIE(16)
1796         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1797           <ac>
1798           <ad>
1799           <ab>
1800   16:   SUCCEED(0)
1801   17:   NOTHING(18)
1802   18: END(0)
1803
1804 Cases where tail != last would be like /(?foo|bar)baz/:
1805
1806    1: BRANCH(4)
1807    2:   EXACT <foo>(8)
1808    4: BRANCH(7)
1809    5:   EXACT <bar>(8)
1810    7: TAIL(8)
1811    8: EXACT <baz>(10)
1812   10: END(0)
1813
1814 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1815 and would end up looking like:
1816
1817     1: TRIE(8)
1818       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1819         <foo>
1820         <bar>
1821    7: TAIL(8)
1822    8: EXACT <baz>(10)
1823   10: END(0)
1824
1825     d = uvchr_to_utf8_flags(d, uv, 0);
1826
1827 is the recommended Unicode-aware way of saying
1828
1829     *(d++) = uv;
1830 */
1831
1832 #define TRIE_STORE_REVCHAR(val)                                            \
1833     STMT_START {                                                           \
1834         if (UTF) {                                                         \
1835             SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1836             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1837             unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1838             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1839             SvPOK_on(zlopp);                                               \
1840             SvUTF8_on(zlopp);                                              \
1841             av_push(revcharmap, zlopp);                                    \
1842         } else {                                                           \
1843             char ooooff = (char)val;                                           \
1844             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1845         }                                                                  \
1846         } STMT_END
1847
1848 /* This gets the next character from the input, folding it if not already
1849  * folded. */
1850 #define TRIE_READ_CHAR STMT_START {                                           \
1851     wordlen++;                                                                \
1852     if ( UTF ) {                                                              \
1853         /* if it is UTF then it is either already folded, or does not need    \
1854          * folding */                                                         \
1855         uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
1856     }                                                                         \
1857     else if (folder == PL_fold_latin1) {                                      \
1858         /* This folder implies Unicode rules, which in the range expressible  \
1859          *  by not UTF is the lower case, with the two exceptions, one of     \
1860          *  which should have been taken care of before calling this */       \
1861         assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
1862         uvc = toLOWER_L1(*uc);                                                \
1863         if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
1864         len = 1;                                                              \
1865     } else {                                                                  \
1866         /* raw data, will be folded later if needed */                        \
1867         uvc = (U32)*uc;                                                       \
1868         len = 1;                                                              \
1869     }                                                                         \
1870 } STMT_END
1871
1872
1873
1874 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1875     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1876         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1877         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1878     }                                                           \
1879     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1880     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1881     TRIE_LIST_CUR( state )++;                                   \
1882 } STMT_END
1883
1884 #define TRIE_LIST_NEW(state) STMT_START {                       \
1885     Newxz( trie->states[ state ].trans.list,               \
1886         4, reg_trie_trans_le );                                 \
1887      TRIE_LIST_CUR( state ) = 1;                                \
1888      TRIE_LIST_LEN( state ) = 4;                                \
1889 } STMT_END
1890
1891 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1892     U16 dupe= trie->states[ state ].wordnum;                    \
1893     regnode * const noper_next = regnext( noper );              \
1894                                                                 \
1895     DEBUG_r({                                                   \
1896         /* store the word for dumping */                        \
1897         SV* tmp;                                                \
1898         if (OP(noper) != NOTHING)                               \
1899             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1900         else                                                    \
1901             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1902         av_push( trie_words, tmp );                             \
1903     });                                                         \
1904                                                                 \
1905     curword++;                                                  \
1906     trie->wordinfo[curword].prev   = 0;                         \
1907     trie->wordinfo[curword].len    = wordlen;                   \
1908     trie->wordinfo[curword].accept = state;                     \
1909                                                                 \
1910     if ( noper_next < tail ) {                                  \
1911         if (!trie->jump)                                        \
1912             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
1913                                                  sizeof(U16) ); \
1914         trie->jump[curword] = (U16)(noper_next - convert);      \
1915         if (!jumper)                                            \
1916             jumper = noper_next;                                \
1917         if (!nextbranch)                                        \
1918             nextbranch= regnext(cur);                           \
1919     }                                                           \
1920                                                                 \
1921     if ( dupe ) {                                               \
1922         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1923         /* chain, so that when the bits of chain are later    */\
1924         /* linked together, the dups appear in the chain      */\
1925         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1926         trie->wordinfo[dupe].prev = curword;                    \
1927     } else {                                                    \
1928         /* we haven't inserted this word yet.                */ \
1929         trie->states[ state ].wordnum = curword;                \
1930     }                                                           \
1931 } STMT_END
1932
1933
1934 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1935      ( ( base + charid >=  ucharcount                                   \
1936          && base + charid < ubound                                      \
1937          && state == trie->trans[ base - ucharcount + charid ].check    \
1938          && trie->trans[ base - ucharcount + charid ].next )            \
1939            ? trie->trans[ base - ucharcount + charid ].next             \
1940            : ( state==1 ? special : 0 )                                 \
1941       )
1942
1943 #define MADE_TRIE       1
1944 #define MADE_JUMP_TRIE  2
1945 #define MADE_EXACT_TRIE 4
1946
1947 STATIC I32
1948 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
1949                   regnode *first, regnode *last, regnode *tail,
1950                   U32 word_count, U32 flags, U32 depth)
1951 {
1952     dVAR;
1953     /* first pass, loop through and scan words */
1954     reg_trie_data *trie;
1955     HV *widecharmap = NULL;
1956     AV *revcharmap = newAV();
1957     regnode *cur;
1958     STRLEN len = 0;
1959     UV uvc = 0;
1960     U16 curword = 0;
1961     U32 next_alloc = 0;
1962     regnode *jumper = NULL;
1963     regnode *nextbranch = NULL;
1964     regnode *convert = NULL;
1965     U32 *prev_states; /* temp array mapping each state to previous one */
1966     /* we just use folder as a flag in utf8 */
1967     const U8 * folder = NULL;
1968
1969 #ifdef DEBUGGING
1970     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
1971     AV *trie_words = NULL;
1972     /* along with revcharmap, this only used during construction but both are
1973      * useful during debugging so we store them in the struct when debugging.
1974      */
1975 #else
1976     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
1977     STRLEN trie_charcount=0;
1978 #endif
1979     SV *re_trie_maxbuff;
1980     GET_RE_DEBUG_FLAGS_DECL;
1981
1982     PERL_ARGS_ASSERT_MAKE_TRIE;
1983 #ifndef DEBUGGING
1984     PERL_UNUSED_ARG(depth);
1985 #endif
1986
1987     switch (flags) {
1988         case EXACT: break;
1989         case EXACTFA:
1990         case EXACTFU_SS:
1991         case EXACTFU: folder = PL_fold_latin1; break;
1992         case EXACTF:  folder = PL_fold; break;
1993         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1994     }
1995
1996     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1997     trie->refcount = 1;
1998     trie->startstate = 1;
1999     trie->wordcount = word_count;
2000     RExC_rxi->data->data[ data_slot ] = (void*)trie;
2001     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2002     if (flags == EXACT)
2003         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2004     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2005                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
2006
2007     DEBUG_r({
2008         trie_words = newAV();
2009     });
2010
2011     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2012     assert(re_trie_maxbuff);
2013     if (!SvIOK(re_trie_maxbuff)) {
2014         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2015     }
2016     DEBUG_TRIE_COMPILE_r({
2017         PerlIO_printf( Perl_debug_log,
2018           "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2019           (int)depth * 2 + 2, "",
2020           REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2021           REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2022     });
2023
2024    /* Find the node we are going to overwrite */
2025     if ( first == startbranch && OP( last ) != BRANCH ) {
2026         /* whole branch chain */
2027         convert = first;
2028     } else {
2029         /* branch sub-chain */
2030         convert = NEXTOPER( first );
2031     }
2032
2033     /*  -- First loop and Setup --
2034
2035        We first traverse the branches and scan each word to determine if it
2036        contains widechars, and how many unique chars there are, this is
2037        important as we have to build a table with at least as many columns as we
2038        have unique chars.
2039
2040        We use an array of integers to represent the character codes 0..255
2041        (trie->charmap) and we use a an HV* to store Unicode characters. We use
2042        the native representation of the character value as the key and IV's for
2043        the coded index.
2044
2045        *TODO* If we keep track of how many times each character is used we can
2046        remap the columns so that the table compression later on is more
2047        efficient in terms of memory by ensuring the most common value is in the
2048        middle and the least common are on the outside.  IMO this would be better
2049        than a most to least common mapping as theres a decent chance the most
2050        common letter will share a node with the least common, meaning the node
2051        will not be compressible. With a middle is most common approach the worst
2052        case is when we have the least common nodes twice.
2053
2054      */
2055
2056     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2057         regnode *noper = NEXTOPER( cur );
2058         const U8 *uc = (U8*)STRING( noper );
2059         const U8 *e  = uc + STR_LEN( noper );
2060         int foldlen = 0;
2061         U32 wordlen      = 0;         /* required init */
2062         STRLEN minchars = 0;
2063         STRLEN maxchars = 0;
2064         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2065                                                bitmap?*/
2066
2067         if (OP(noper) == NOTHING) {
2068             regnode *noper_next= regnext(noper);
2069             if (noper_next != tail && OP(noper_next) == flags) {
2070                 noper = noper_next;
2071                 uc= (U8*)STRING(noper);
2072                 e= uc + STR_LEN(noper);
2073                 trie->minlen= STR_LEN(noper);
2074             } else {
2075                 trie->minlen= 0;
2076                 continue;
2077             }
2078         }
2079
2080         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2081             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2082                                           regardless of encoding */
2083             if (OP( noper ) == EXACTFU_SS) {
2084                 /* false positives are ok, so just set this */
2085                 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2086             }
2087         }
2088         for ( ; uc < e ; uc += len ) {  /* Look at each char in the current
2089                                            branch */
2090             TRIE_CHARCOUNT(trie)++;
2091             TRIE_READ_CHAR;
2092
2093             /* TRIE_READ_CHAR returns the current character, or its fold if /i
2094              * is in effect.  Under /i, this character can match itself, or
2095              * anything that folds to it.  If not under /i, it can match just
2096              * itself.  Most folds are 1-1, for example k, K, and KELVIN SIGN
2097              * all fold to k, and all are single characters.   But some folds
2098              * expand to more than one character, so for example LATIN SMALL
2099              * LIGATURE FFI folds to the three character sequence 'ffi'.  If
2100              * the string beginning at 'uc' is 'ffi', it could be matched by
2101              * three characters, or just by the one ligature character. (It
2102              * could also be matched by two characters: LATIN SMALL LIGATURE FF
2103              * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2104              * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2105              * match.)  The trie needs to know the minimum and maximum number
2106              * of characters that could match so that it can use size alone to
2107              * quickly reject many match attempts.  The max is simple: it is
2108              * the number of folded characters in this branch (since a fold is
2109              * never shorter than what folds to it. */
2110
2111             maxchars++;
2112
2113             /* And the min is equal to the max if not under /i (indicated by
2114              * 'folder' being NULL), or there are no multi-character folds.  If
2115              * there is a multi-character fold, the min is incremented just
2116              * once, for the character that folds to the sequence.  Each
2117              * character in the sequence needs to be added to the list below of
2118              * characters in the trie, but we count only the first towards the
2119              * min number of characters needed.  This is done through the
2120              * variable 'foldlen', which is returned by the macros that look
2121              * for these sequences as the number of bytes the sequence
2122              * occupies.  Each time through the loop, we decrement 'foldlen' by
2123              * how many bytes the current char occupies.  Only when it reaches
2124              * 0 do we increment 'minchars' or look for another multi-character
2125              * sequence. */
2126             if (folder == NULL) {
2127                 minchars++;
2128             }
2129             else if (foldlen > 0) {
2130                 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2131             }
2132             else {
2133                 minchars++;
2134
2135                 /* See if *uc is the beginning of a multi-character fold.  If
2136                  * so, we decrement the length remaining to look at, to account
2137                  * for the current character this iteration.  (We can use 'uc'
2138                  * instead of the fold returned by TRIE_READ_CHAR because for
2139                  * non-UTF, the latin1_safe macro is smart enough to account
2140                  * for all the unfolded characters, and because for UTF, the
2141                  * string will already have been folded earlier in the
2142                  * compilation process */
2143                 if (UTF) {
2144                     if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2145                         foldlen -= UTF8SKIP(uc);
2146                     }
2147                 }
2148                 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2149                     foldlen--;
2150                 }
2151             }
2152
2153             /* The current character (and any potential folds) should be added
2154              * to the possible matching characters for this position in this
2155              * branch */
2156             if ( uvc < 256 ) {
2157                 if ( folder ) {
2158                     U8 folded= folder[ (U8) uvc ];
2159                     if ( !trie->charmap[ folded ] ) {
2160                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
2161                         TRIE_STORE_REVCHAR( folded );
2162                     }
2163                 }
2164                 if ( !trie->charmap[ uvc ] ) {
2165                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2166                     TRIE_STORE_REVCHAR( uvc );
2167                 }
2168                 if ( set_bit ) {
2169                     /* store the codepoint in the bitmap, and its folded
2170                      * equivalent. */
2171                     TRIE_BITMAP_SET(trie, uvc);
2172
2173                     /* store the folded codepoint */
2174                     if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2175
2176                     if ( !UTF ) {
2177                         /* store first byte of utf8 representation of
2178                            variant codepoints */
2179                         if (! UVCHR_IS_INVARIANT(uvc)) {
2180                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2181                         }
2182                     }
2183                     set_bit = 0; /* We've done our bit :-) */
2184                 }
2185             } else {
2186
2187                 /* XXX We could come up with the list of code points that fold
2188                  * to this using PL_utf8_foldclosures, except not for
2189                  * multi-char folds, as there may be multiple combinations
2190                  * there that could work, which needs to wait until runtime to
2191                  * resolve (The comment about LIGATURE FFI above is such an
2192                  * example */
2193
2194                 SV** svpp;
2195                 if ( !widecharmap )
2196                     widecharmap = newHV();
2197
2198                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2199
2200                 if ( !svpp )
2201                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2202
2203                 if ( !SvTRUE( *svpp ) ) {
2204                     sv_setiv( *svpp, ++trie->uniquecharcount );
2205                     TRIE_STORE_REVCHAR(uvc);
2206                 }
2207             }
2208         } /* end loop through characters in this branch of the trie */
2209
2210         /* We take the min and max for this branch and combine to find the min
2211          * and max for all branches processed so far */
2212         if( cur == first ) {
2213             trie->minlen = minchars;
2214             trie->maxlen = maxchars;
2215         } else if (minchars < trie->minlen) {
2216             trie->minlen = minchars;
2217         } else if (maxchars > trie->maxlen) {
2218             trie->maxlen = maxchars;
2219         }
2220     } /* end first pass */
2221     DEBUG_TRIE_COMPILE_r(
2222         PerlIO_printf( Perl_debug_log,
2223                 "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2224                 (int)depth * 2 + 2,"",
2225                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2226                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2227                 (int)trie->minlen, (int)trie->maxlen )
2228     );
2229
2230     /*
2231         We now know what we are dealing with in terms of unique chars and
2232         string sizes so we can calculate how much memory a naive
2233         representation using a flat table  will take. If it's over a reasonable
2234         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2235         conservative but potentially much slower representation using an array
2236         of lists.
2237
2238         At the end we convert both representations into the same compressed
2239         form that will be used in regexec.c for matching with. The latter
2240         is a form that cannot be used to construct with but has memory
2241         properties similar to the list form and access properties similar
2242         to the table form making it both suitable for fast searches and
2243         small enough that its feasable to store for the duration of a program.
2244
2245         See the comment in the code where the compressed table is produced
2246         inplace from the flat tabe representation for an explanation of how
2247         the compression works.
2248
2249     */
2250
2251
2252     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2253     prev_states[1] = 0;
2254
2255     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2256                                                     > SvIV(re_trie_maxbuff) )
2257     {
2258         /*
2259             Second Pass -- Array Of Lists Representation
2260
2261             Each state will be represented by a list of charid:state records
2262             (reg_trie_trans_le) the first such element holds the CUR and LEN
2263             points of the allocated array. (See defines above).
2264
2265             We build the initial structure using the lists, and then convert
2266             it into the compressed table form which allows faster lookups
2267             (but cant be modified once converted).
2268         */
2269
2270         STRLEN transcount = 1;
2271
2272         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2273             "%*sCompiling trie using list compiler\n",
2274             (int)depth * 2 + 2, ""));
2275
2276         trie->states = (reg_trie_state *)
2277             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2278                                   sizeof(reg_trie_state) );
2279         TRIE_LIST_NEW(1);
2280         next_alloc = 2;
2281
2282         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2283
2284             regnode *noper   = NEXTOPER( cur );
2285             U8 *uc           = (U8*)STRING( noper );
2286             const U8 *e      = uc + STR_LEN( noper );
2287             U32 state        = 1;         /* required init */
2288             U16 charid       = 0;         /* sanity init */
2289             U32 wordlen      = 0;         /* required init */
2290
2291             if (OP(noper) == NOTHING) {
2292                 regnode *noper_next= regnext(noper);
2293                 if (noper_next != tail && OP(noper_next) == flags) {
2294                     noper = noper_next;
2295                     uc= (U8*)STRING(noper);
2296                     e= uc + STR_LEN(noper);
2297                 }
2298             }
2299
2300             if (OP(noper) != NOTHING) {
2301                 for ( ; uc < e ; uc += len ) {
2302
2303                     TRIE_READ_CHAR;
2304
2305                     if ( uvc < 256 ) {
2306                         charid = trie->charmap[ uvc ];
2307                     } else {
2308                         SV** const svpp = hv_fetch( widecharmap,
2309                                                     (char*)&uvc,
2310                                                     sizeof( UV ),
2311                                                     0);
2312                         if ( !svpp ) {
2313                             charid = 0;
2314                         } else {
2315                             charid=(U16)SvIV( *svpp );
2316                         }
2317                     }
2318                     /* charid is now 0 if we dont know the char read, or
2319                      * nonzero if we do */
2320                     if ( charid ) {
2321
2322                         U16 check;
2323                         U32 newstate = 0;
2324
2325                         charid--;
2326                         if ( !trie->states[ state ].trans.list ) {
2327                             TRIE_LIST_NEW( state );
2328                         }
2329                         for ( check = 1;
2330                               check <= TRIE_LIST_USED( state );
2331                               check++ )
2332                         {
2333                             if ( TRIE_LIST_ITEM( state, check ).forid
2334                                                                     == charid )
2335                             {
2336                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2337                                 break;
2338                             }
2339                         }
2340                         if ( ! newstate ) {
2341                             newstate = next_alloc++;
2342                             prev_states[newstate] = state;
2343                             TRIE_LIST_PUSH( state, charid, newstate );
2344                             transcount++;
2345                         }
2346                         state = newstate;
2347                     } else {
2348                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2349                     }
2350                 }
2351             }
2352             TRIE_HANDLE_WORD(state);
2353
2354         } /* end second pass */
2355
2356         /* next alloc is the NEXT state to be allocated */
2357         trie->statecount = next_alloc;
2358         trie->states = (reg_trie_state *)
2359             PerlMemShared_realloc( trie->states,
2360                                    next_alloc
2361                                    * sizeof(reg_trie_state) );
2362
2363         /* and now dump it out before we compress it */
2364         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2365                                                          revcharmap, next_alloc,
2366                                                          depth+1)
2367         );
2368
2369         trie->trans = (reg_trie_trans *)
2370             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2371         {
2372             U32 state;
2373             U32 tp = 0;
2374             U32 zp = 0;
2375
2376
2377             for( state=1 ; state < next_alloc ; state ++ ) {
2378                 U32 base=0;
2379
2380                 /*
2381                 DEBUG_TRIE_COMPILE_MORE_r(
2382                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2383                 );
2384                 */
2385
2386                 if (trie->states[state].trans.list) {
2387                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2388                     U16 maxid=minid;
2389                     U16 idx;
2390
2391                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2392                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2393                         if ( forid < minid ) {
2394                             minid=forid;
2395                         } else if ( forid > maxid ) {
2396                             maxid=forid;
2397                         }
2398                     }
2399                     if ( transcount < tp + maxid - minid + 1) {
2400                         transcount *= 2;
2401                         trie->trans = (reg_trie_trans *)
2402                             PerlMemShared_realloc( trie->trans,
2403                                                      transcount
2404                                                      * sizeof(reg_trie_trans) );
2405                         Zero( trie->trans + (transcount / 2),
2406                               transcount / 2,
2407                               reg_trie_trans );
2408                     }
2409                     base = trie->uniquecharcount + tp - minid;
2410                     if ( maxid == minid ) {
2411                         U32 set = 0;
2412                         for ( ; zp < tp ; zp++ ) {
2413                             if ( ! trie->trans[ zp ].next ) {
2414                                 base = trie->uniquecharcount + zp - minid;
2415                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2416                                                                    1).newstate;
2417                                 trie->trans[ zp ].check = state;
2418                                 set = 1;
2419                                 break;
2420                             }
2421                         }
2422                         if ( !set ) {
2423                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2424                                                                    1).newstate;
2425                             trie->trans[ tp ].check = state;
2426                             tp++;
2427                             zp = tp;
2428                         }
2429                     } else {
2430                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2431                             const U32 tid = base
2432                                            - trie->uniquecharcount
2433                                            + TRIE_LIST_ITEM( state, idx ).forid;
2434                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2435                                                                 idx ).newstate;
2436                             trie->trans[ tid ].check = state;
2437                         }
2438                         tp += ( maxid - minid + 1 );
2439                     }
2440                     Safefree(trie->states[ state ].trans.list);
2441                 }
2442                 /*
2443                 DEBUG_TRIE_COMPILE_MORE_r(
2444                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2445                 );
2446                 */
2447                 trie->states[ state ].trans.base=base;
2448             }
2449             trie->lasttrans = tp + 1;
2450         }
2451     } else {
2452         /*
2453            Second Pass -- Flat Table Representation.
2454
2455            we dont use the 0 slot of either trans[] or states[] so we add 1 to
2456            each.  We know that we will need Charcount+1 trans at most to store
2457            the data (one row per char at worst case) So we preallocate both
2458            structures assuming worst case.
2459
2460            We then construct the trie using only the .next slots of the entry
2461            structs.
2462
2463            We use the .check field of the first entry of the node temporarily
2464            to make compression both faster and easier by keeping track of how
2465            many non zero fields are in the node.
2466
2467            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2468            transition.
2469
2470            There are two terms at use here: state as a TRIE_NODEIDX() which is
2471            a number representing the first entry of the node, and state as a
2472            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2473            and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2474            if there are 2 entrys per node. eg:
2475
2476              A B       A B
2477           1. 2 4    1. 3 7
2478           2. 0 3    3. 0 5
2479           3. 0 0    5. 0 0
2480           4. 0 0    7. 0 0
2481
2482            The table is internally in the right hand, idx form. However as we
2483            also have to deal with the states array which is indexed by nodenum
2484            we have to use TRIE_NODENUM() to convert.
2485
2486         */
2487         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2488             "%*sCompiling trie using table compiler\n",
2489             (int)depth * 2 + 2, ""));
2490
2491         trie->trans = (reg_trie_trans *)
2492             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2493                                   * trie->uniquecharcount + 1,
2494                                   sizeof(reg_trie_trans) );
2495         trie->states = (reg_trie_state *)
2496             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2497                                   sizeof(reg_trie_state) );
2498         next_alloc = trie->uniquecharcount + 1;
2499
2500
2501         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2502
2503             regnode *noper   = NEXTOPER( cur );
2504             const U8 *uc     = (U8*)STRING( noper );
2505             const U8 *e      = uc + STR_LEN( noper );
2506
2507             U32 state        = 1;         /* required init */
2508
2509             U16 charid       = 0;         /* sanity init */
2510             U32 accept_state = 0;         /* sanity init */
2511
2512             U32 wordlen      = 0;         /* required init */
2513
2514             if (OP(noper) == NOTHING) {
2515                 regnode *noper_next= regnext(noper);
2516                 if (noper_next != tail && OP(noper_next) == flags) {
2517                     noper = noper_next;
2518                     uc= (U8*)STRING(noper);
2519                     e= uc + STR_LEN(noper);
2520                 }
2521             }
2522
2523             if ( OP(noper) != NOTHING ) {
2524                 for ( ; uc < e ; uc += len ) {
2525
2526                     TRIE_READ_CHAR;
2527
2528                     if ( uvc < 256 ) {
2529                         charid = trie->charmap[ uvc ];
2530                     } else {
2531                         SV* const * const svpp = hv_fetch( widecharmap,
2532                                                            (char*)&uvc,
2533                                                            sizeof( UV ),
2534                                                            0);
2535                         charid = svpp ? (U16)SvIV(*svpp) : 0;
2536                     }
2537                     if ( charid ) {
2538                         charid--;
2539                         if ( !trie->trans[ state + charid ].next ) {
2540                             trie->trans[ state + charid ].next = next_alloc;
2541                             trie->trans[ state ].check++;
2542                             prev_states[TRIE_NODENUM(next_alloc)]
2543                                     = TRIE_NODENUM(state);
2544                             next_alloc += trie->uniquecharcount;
2545                         }
2546                         state = trie->trans[ state + charid ].next;
2547                     } else {
2548                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2549                     }
2550                     /* charid is now 0 if we dont know the char read, or
2551                      * nonzero if we do */
2552                 }
2553             }
2554             accept_state = TRIE_NODENUM( state );
2555             TRIE_HANDLE_WORD(accept_state);
2556
2557         } /* end second pass */
2558
2559         /* and now dump it out before we compress it */
2560         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2561                                                           revcharmap,
2562                                                           next_alloc, depth+1));
2563
2564         {
2565         /*
2566            * Inplace compress the table.*
2567
2568            For sparse data sets the table constructed by the trie algorithm will
2569            be mostly 0/FAIL transitions or to put it another way mostly empty.
2570            (Note that leaf nodes will not contain any transitions.)
2571
2572            This algorithm compresses the tables by eliminating most such
2573            transitions, at the cost of a modest bit of extra work during lookup:
2574
2575            - Each states[] entry contains a .base field which indicates the
2576            index in the state[] array wheres its transition data is stored.
2577
2578            - If .base is 0 there are no valid transitions from that node.
2579
2580            - If .base is nonzero then charid is added to it to find an entry in
2581            the trans array.
2582
2583            -If trans[states[state].base+charid].check!=state then the
2584            transition is taken to be a 0/Fail transition. Thus if there are fail
2585            transitions at the front of the node then the .base offset will point
2586            somewhere inside the previous nodes data (or maybe even into a node
2587            even earlier), but the .check field determines if the transition is
2588            valid.
2589
2590            XXX - wrong maybe?
2591            The following process inplace converts the table to the compressed
2592            table: We first do not compress the root node 1,and mark all its
2593            .check pointers as 1 and set its .base pointer as 1 as well. This
2594            allows us to do a DFA construction from the compressed table later,
2595            and ensures that any .base pointers we calculate later are greater
2596            than 0.
2597
2598            - We set 'pos' to indicate the first entry of the second node.
2599
2600            - We then iterate over the columns of the node, finding the first and
2601            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2602            and set the .check pointers accordingly, and advance pos
2603            appropriately and repreat for the next node. Note that when we copy
2604            the next pointers we have to convert them from the original
2605            NODEIDX form to NODENUM form as the former is not valid post
2606            compression.
2607
2608            - If a node has no transitions used we mark its base as 0 and do not
2609            advance the pos pointer.
2610
2611            - If a node only has one transition we use a second pointer into the
2612            structure to fill in allocated fail transitions from other states.
2613            This pointer is independent of the main pointer and scans forward
2614            looking for null transitions that are allocated to a state. When it
2615            finds one it writes the single transition into the "hole".  If the
2616            pointer doesnt find one the single transition is appended as normal.
2617
2618            - Once compressed we can Renew/realloc the structures to release the
2619            excess space.
2620
2621            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2622            specifically Fig 3.47 and the associated pseudocode.
2623
2624            demq
2625         */
2626         const U32 laststate = TRIE_NODENUM( next_alloc );
2627         U32 state, charid;
2628         U32 pos = 0, zp=0;
2629         trie->statecount = laststate;
2630
2631         for ( state = 1 ; state < laststate ; state++ ) {
2632             U8 flag = 0;
2633             const U32 stateidx = TRIE_NODEIDX( state );
2634             const U32 o_used = trie->trans[ stateidx ].check;
2635             U32 used = trie->trans[ stateidx ].check;
2636             trie->trans[ stateidx ].check = 0;
2637
2638             for ( charid = 0;
2639                   used && charid < trie->uniquecharcount;
2640                   charid++ )
2641             {
2642                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2643                     if ( trie->trans[ stateidx + charid ].next ) {
2644                         if (o_used == 1) {
2645                             for ( ; zp < pos ; zp++ ) {
2646                                 if ( ! trie->trans[ zp ].next ) {
2647                                     break;
2648                                 }
2649                             }
2650                             trie->states[ state ].trans.base
2651                                                     = zp
2652                                                       + trie->uniquecharcount
2653                                                       - charid ;
2654                             trie->trans[ zp ].next
2655                                 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2656                                                              + charid ].next );
2657                             trie->trans[ zp ].check = state;
2658                             if ( ++zp > pos ) pos = zp;
2659                             break;
2660                         }
2661                         used--;
2662                     }
2663                     if ( !flag ) {
2664                         flag = 1;
2665                         trie->states[ state ].trans.base
2666                                        = pos + trie->uniquecharcount - charid ;
2667                     }
2668                     trie->trans[ pos ].next
2669                         = SAFE_TRIE_NODENUM(
2670                                        trie->trans[ stateidx + charid ].next );
2671                     trie->trans[ pos ].check = state;
2672                     pos++;
2673                 }
2674             }
2675         }
2676         trie->lasttrans = pos + 1;
2677         trie->states = (reg_trie_state *)
2678             PerlMemShared_realloc( trie->states, laststate
2679                                    * sizeof(reg_trie_state) );
2680         DEBUG_TRIE_COMPILE_MORE_r(
2681             PerlIO_printf( Perl_debug_log,
2682                 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2683                 (int)depth * 2 + 2,"",
2684                 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2685                        + 1 ),
2686                 (IV)next_alloc,
2687                 (IV)pos,
2688                 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2689             );
2690
2691         } /* end table compress */
2692     }
2693     DEBUG_TRIE_COMPILE_MORE_r(
2694             PerlIO_printf(Perl_debug_log,
2695                 "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2696                 (int)depth * 2 + 2, "",
2697                 (UV)trie->statecount,
2698                 (UV)trie->lasttrans)
2699     );
2700     /* resize the trans array to remove unused space */
2701     trie->trans = (reg_trie_trans *)
2702         PerlMemShared_realloc( trie->trans, trie->lasttrans
2703                                * sizeof(reg_trie_trans) );
2704
2705     {   /* Modify the program and insert the new TRIE node */
2706         U8 nodetype =(U8)(flags & 0xFF);
2707         char *str=NULL;
2708
2709 #ifdef DEBUGGING
2710         regnode *optimize = NULL;
2711 #ifdef RE_TRACK_PATTERN_OFFSETS
2712
2713         U32 mjd_offset = 0;
2714         U32 mjd_nodelen = 0;
2715 #endif /* RE_TRACK_PATTERN_OFFSETS */
2716 #endif /* DEBUGGING */
2717         /*
2718            This means we convert either the first branch or the first Exact,
2719            depending on whether the thing following (in 'last') is a branch
2720            or not and whther first is the startbranch (ie is it a sub part of
2721            the alternation or is it the whole thing.)
2722            Assuming its a sub part we convert the EXACT otherwise we convert
2723            the whole branch sequence, including the first.
2724          */
2725         /* Find the node we are going to overwrite */
2726         if ( first != startbranch || OP( last ) == BRANCH ) {
2727             /* branch sub-chain */
2728             NEXT_OFF( first ) = (U16)(last - first);
2729 #ifdef RE_TRACK_PATTERN_OFFSETS
2730             DEBUG_r({
2731                 mjd_offset= Node_Offset((convert));
2732                 mjd_nodelen= Node_Length((convert));
2733             });
2734 #endif
2735             /* whole branch chain */
2736         }
2737 #ifdef RE_TRACK_PATTERN_OFFSETS
2738         else {
2739             DEBUG_r({
2740                 const  regnode *nop = NEXTOPER( convert );
2741                 mjd_offset= Node_Offset((nop));
2742                 mjd_nodelen= Node_Length((nop));
2743             });
2744         }
2745         DEBUG_OPTIMISE_r(
2746             PerlIO_printf(Perl_debug_log,
2747                 "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2748                 (int)depth * 2 + 2, "",
2749                 (UV)mjd_offset, (UV)mjd_nodelen)
2750         );
2751 #endif
2752         /* But first we check to see if there is a common prefix we can
2753            split out as an EXACT and put in front of the TRIE node.  */
2754         trie->startstate= 1;
2755         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2756             U32 state;
2757             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2758                 U32 ofs = 0;
2759                 I32 idx = -1;
2760                 U32 count = 0;
2761                 const U32 base = trie->states[ state ].trans.base;
2762
2763                 if ( trie->states[state].wordnum )
2764                         count = 1;
2765
2766                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2767                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2768                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2769                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2770                     {
2771                         if ( ++count > 1 ) {
2772                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2773                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2774                             if ( state == 1 ) break;
2775                             if ( count == 2 ) {
2776                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2777                                 DEBUG_OPTIMISE_r(
2778                                     PerlIO_printf(Perl_debug_log,
2779                                         "%*sNew Start State=%"UVuf" Class: [",
2780                                         (int)depth * 2 + 2, "",
2781                                         (UV)state));
2782                                 if (idx >= 0) {
2783                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2784                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2785
2786                                     TRIE_BITMAP_SET(trie,*ch);
2787                                     if ( folder )
2788                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2789                                     DEBUG_OPTIMISE_r(
2790                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2791                                     );
2792                                 }
2793                             }
2794                             TRIE_BITMAP_SET(trie,*ch);
2795                             if ( folder )
2796                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2797                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2798                         }
2799                         idx = ofs;
2800                     }
2801                 }
2802                 if ( count == 1 ) {
2803                     SV **tmp = av_fetch( revcharmap, idx, 0);
2804                     STRLEN len;
2805                     char *ch = SvPV( *tmp, len );
2806                     DEBUG_OPTIMISE_r({
2807                         SV *sv=sv_newmortal();
2808                         PerlIO_printf( Perl_debug_log,
2809                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2810                             (int)depth * 2 + 2, "",
2811                             (UV)state, (UV)idx,
2812                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2813                                 PL_colors[0], PL_colors[1],
2814                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2815                                 PERL_PV_ESCAPE_FIRSTCHAR
2816                             )
2817                         );
2818                     });
2819                     if ( state==1 ) {
2820                         OP( convert ) = nodetype;
2821                         str=STRING(convert);
2822                         STR_LEN(convert)=0;
2823                     }
2824                     STR_LEN(convert) += len;
2825                     while (len--)
2826                         *str++ = *ch++;
2827                 } else {
2828 #ifdef DEBUGGING
2829                     if (state>1)
2830                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2831 #endif
2832                     break;
2833                 }
2834             }
2835             trie->prefixlen = (state-1);
2836             if (str) {
2837                 regnode *n = convert+NODE_SZ_STR(convert);
2838                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2839                 trie->startstate = state;
2840                 trie->minlen -= (state - 1);
2841                 trie->maxlen -= (state - 1);
2842 #ifdef DEBUGGING
2843                /* At least the UNICOS C compiler choked on this
2844                 * being argument to DEBUG_r(), so let's just have
2845                 * it right here. */
2846                if (
2847 #ifdef PERL_EXT_RE_BUILD
2848                    1
2849 #else
2850                    DEBUG_r_TEST
2851 #endif
2852                    ) {
2853                    regnode *fix = convert;
2854                    U32 word = trie->wordcount;
2855                    mjd_nodelen++;
2856                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2857                    while( ++fix < n ) {
2858                        Set_Node_Offset_Length(fix, 0, 0);
2859                    }
2860                    while (word--) {
2861                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2862                        if (tmp) {
2863                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2864                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2865                            else
2866                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2867                        }
2868                    }
2869                }
2870 #endif
2871                 if (trie->maxlen) {
2872                     convert = n;
2873                 } else {
2874                     NEXT_OFF(convert) = (U16)(tail - convert);
2875                     DEBUG_r(optimize= n);
2876                 }
2877             }
2878         }
2879         if (!jumper)
2880             jumper = last;
2881         if ( trie->maxlen ) {
2882             NEXT_OFF( convert ) = (U16)(tail - convert);
2883             ARG_SET( convert, data_slot );
2884             /* Store the offset to the first unabsorbed branch in
2885                jump[0], which is otherwise unused by the jump logic.
2886                We use this when dumping a trie and during optimisation. */
2887             if (trie->jump)
2888                 trie->jump[0] = (U16)(nextbranch - convert);
2889
2890             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2891              *   and there is a bitmap
2892              *   and the first "jump target" node we found leaves enough room
2893              * then convert the TRIE node into a TRIEC node, with the bitmap
2894              * embedded inline in the opcode - this is hypothetically faster.
2895              */
2896             if ( !trie->states[trie->startstate].wordnum
2897                  && trie->bitmap
2898                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2899             {
2900                 OP( convert ) = TRIEC;
2901                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2902                 PerlMemShared_free(trie->bitmap);
2903                 trie->bitmap= NULL;
2904             } else
2905                 OP( convert ) = TRIE;
2906
2907             /* store the type in the flags */
2908             convert->flags = nodetype;
2909             DEBUG_r({
2910             optimize = convert
2911                       + NODE_STEP_REGNODE
2912                       + regarglen[ OP( convert ) ];
2913             });
2914             /* XXX We really should free up the resource in trie now,
2915                    as we won't use them - (which resources?) dmq */
2916         }
2917         /* needed for dumping*/
2918         DEBUG_r(if (optimize) {
2919             regnode *opt = convert;
2920
2921             while ( ++opt < optimize) {
2922                 Set_Node_Offset_Length(opt,0,0);
2923             }
2924             /*
2925                 Try to clean up some of the debris left after the
2926                 optimisation.
2927              */
2928             while( optimize < jumper ) {
2929                 mjd_nodelen += Node_Length((optimize));
2930                 OP( optimize ) = OPTIMIZED;
2931                 Set_Node_Offset_Length(optimize,0,0);
2932                 optimize++;
2933             }
2934             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2935         });
2936     } /* end node insert */
2937
2938     /*  Finish populating the prev field of the wordinfo array.  Walk back
2939      *  from each accept state until we find another accept state, and if
2940      *  so, point the first word's .prev field at the second word. If the
2941      *  second already has a .prev field set, stop now. This will be the
2942      *  case either if we've already processed that word's accept state,
2943      *  or that state had multiple words, and the overspill words were
2944      *  already linked up earlier.
2945      */
2946     {
2947         U16 word;
2948         U32 state;
2949         U16 prev;
2950
2951         for (word=1; word <= trie->wordcount; word++) {
2952             prev = 0;
2953             if (trie->wordinfo[word].prev)
2954                 continue;
2955             state = trie->wordinfo[word].accept;
2956             while (state) {
2957                 state = prev_states[state];
2958                 if (!state)
2959                     break;
2960                 prev = trie->states[state].wordnum;
2961                 if (prev)
2962                     break;
2963             }
2964             trie->wordinfo[word].prev = prev;
2965         }
2966         Safefree(prev_states);
2967     }
2968
2969
2970     /* and now dump out the compressed format */
2971     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2972
2973     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2974 #ifdef DEBUGGING
2975     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2976     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2977 #else
2978     SvREFCNT_dec_NN(revcharmap);
2979 #endif
2980     return trie->jump
2981            ? MADE_JUMP_TRIE
2982            : trie->startstate>1
2983              ? MADE_EXACT_TRIE
2984              : MADE_TRIE;
2985 }
2986
2987 STATIC regnode *
2988 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
2989 {
2990 /* The Trie is constructed and compressed now so we can build a fail array if
2991  * it's needed
2992
2993    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
2994    3.32 in the
2995    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
2996    Ullman 1985/88
2997    ISBN 0-201-10088-6
2998
2999    We find the fail state for each state in the trie, this state is the longest
3000    proper suffix of the current state's 'word' that is also a proper prefix of
3001    another word in our trie. State 1 represents the word '' and is thus the
3002    default fail state. This allows the DFA not to have to restart after its
3003    tried and failed a word at a given point, it simply continues as though it
3004    had been matching the other word in the first place.
3005    Consider
3006       'abcdgu'=~/abcdefg|cdgu/
3007    When we get to 'd' we are still matching the first word, we would encounter
3008    'g' which would fail, which would bring us to the state representing 'd' in
3009    the second word where we would try 'g' and succeed, proceeding to match
3010    'cdgu'.
3011  */
3012  /* add a fail transition */
3013     const U32 trie_offset = ARG(source);
3014     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3015     U32 *q;
3016     const U32 ucharcount = trie->uniquecharcount;
3017     const U32 numstates = trie->statecount;
3018     const U32 ubound = trie->lasttrans + ucharcount;
3019     U32 q_read = 0;
3020     U32 q_write = 0;
3021     U32 charid;
3022     U32 base = trie->states[ 1 ].trans.base;
3023     U32 *fail;
3024     reg_ac_data *aho;
3025     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3026     regnode *stclass;
3027     GET_RE_DEBUG_FLAGS_DECL;
3028
3029     PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3030 #ifndef DEBUGGING
3031     PERL_UNUSED_ARG(depth);
3032 #endif
3033
3034     if ( OP(source) == TRIE ) {
3035         struct regnode_1 *op = (struct regnode_1 *)
3036             PerlMemShared_calloc(1, sizeof(struct regnode_1));
3037         StructCopy(source,op,struct regnode_1);
3038         stclass = (regnode *)op;
3039     } else {
3040         struct regnode_charclass *op = (struct regnode_charclass *)
3041             PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3042         StructCopy(source,op,struct regnode_charclass);
3043         stclass = (regnode *)op;
3044     }
3045     OP(stclass)+=2; /* covert the TRIE type to its AHO-CORASICK equivalent */
3046
3047     ARG_SET( stclass, data_slot );
3048     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3049     RExC_rxi->data->data[ data_slot ] = (void*)aho;
3050     aho->trie=trie_offset;
3051     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3052     Copy( trie->states, aho->states, numstates, reg_trie_state );
3053     Newxz( q, numstates, U32);
3054     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3055     aho->refcount = 1;
3056     fail = aho->fail;
3057     /* initialize fail[0..1] to be 1 so that we always have
3058        a valid final fail state */
3059     fail[ 0 ] = fail[ 1 ] = 1;
3060
3061     for ( charid = 0; charid < ucharcount ; charid++ ) {
3062         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3063         if ( newstate ) {
3064             q[ q_write ] = newstate;
3065             /* set to point at the root */
3066             fail[ q[ q_write++ ] ]=1;
3067         }
3068     }
3069     while ( q_read < q_write) {
3070         const U32 cur = q[ q_read++ % numstates ];
3071         base = trie->states[ cur ].trans.base;
3072
3073         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3074             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3075             if (ch_state) {
3076                 U32 fail_state = cur;
3077                 U32 fail_base;
3078                 do {
3079                     fail_state = fail[ fail_state ];
3080                     fail_base = aho->states[ fail_state ].trans.base;
3081                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3082
3083                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3084                 fail[ ch_state ] = fail_state;
3085                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3086                 {
3087                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
3088                 }
3089                 q[ q_write++ % numstates] = ch_state;
3090             }
3091         }
3092     }
3093     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3094        when we fail in state 1, this allows us to use the
3095        charclass scan to find a valid start char. This is based on the principle
3096        that theres a good chance the string being searched contains lots of stuff
3097        that cant be a start char.
3098      */
3099     fail[ 0 ] = fail[ 1 ] = 0;
3100     DEBUG_TRIE_COMPILE_r({
3101         PerlIO_printf(Perl_debug_log,
3102                       "%*sStclass Failtable (%"UVuf" states): 0",
3103                       (int)(depth * 2), "", (UV)numstates
3104         );
3105         for( q_read=1; q_read<numstates; q_read++ ) {
3106             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3107         }
3108         PerlIO_printf(Perl_debug_log, "\n");
3109     });
3110     Safefree(q);
3111     /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3112     return stclass;
3113 }
3114
3115
3116 #define DEBUG_PEEP(str,scan,depth) \
3117     DEBUG_OPTIMISE_r({if (scan){ \
3118        SV * const mysv=sv_newmortal(); \
3119        regnode *Next = regnext(scan); \
3120        regprop(RExC_rx, mysv, scan, NULL); \
3121        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
3122        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
3123        Next ? (REG_NODE_NUM(Next)) : 0 ); \
3124    }});
3125
3126
3127 /* The below joins as many adjacent EXACTish nodes as possible into a single
3128  * one.  The regop may be changed if the node(s) contain certain sequences that
3129  * require special handling.  The joining is only done if:
3130  * 1) there is room in the current conglomerated node to entirely contain the
3131  *    next one.
3132  * 2) they are the exact same node type
3133  *
3134  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3135  * these get optimized out
3136  *
3137  * If a node is to match under /i (folded), the number of characters it matches
3138  * can be different than its character length if it contains a multi-character
3139  * fold.  *min_subtract is set to the total delta number of characters of the
3140  * input nodes.
3141  *
3142  * And *unfolded_multi_char is set to indicate whether or not the node contains
3143  * an unfolded multi-char fold.  This happens when whether the fold is valid or
3144  * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3145  * SMALL LETTER SHARP S, as only if the target string being matched against
3146  * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3147  * folding rules depend on the locale in force at runtime.  (Multi-char folds
3148  * whose components are all above the Latin1 range are not run-time locale
3149  * dependent, and have already been folded by the time this function is
3150  * called.)
3151  *
3152  * This is as good a place as any to discuss the design of handling these
3153  * multi-character fold sequences.  It's been wrong in Perl for a very long
3154  * time.  There are three code points in Unicode whose multi-character folds
3155  * were long ago discovered to mess things up.  The previous designs for
3156  * dealing with these involved assigning a special node for them.  This
3157  * approach doesn't always work, as evidenced by this example:
3158  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
3159  * Both sides fold to "sss", but if the pattern is parsed to create a node that
3160  * would match just the \xDF, it won't be able to handle the case where a
3161  * successful match would have to cross the node's boundary.  The new approach
3162  * that hopefully generally solves the problem generates an EXACTFU_SS node
3163  * that is "sss" in this case.
3164  *
3165  * It turns out that there are problems with all multi-character folds, and not
3166  * just these three.  Now the code is general, for all such cases.  The
3167  * approach taken is:
3168  * 1)   This routine examines each EXACTFish node that could contain multi-
3169  *      character folded sequences.  Since a single character can fold into
3170  *      such a sequence, the minimum match length for this node is less than
3171  *      the number of characters in the node.  This routine returns in
3172  *      *min_subtract how many characters to subtract from the the actual
3173  *      length of the string to get a real minimum match length; it is 0 if
3174  *      there are no multi-char foldeds.  This delta is used by the caller to
3175  *      adjust the min length of the match, and the delta between min and max,
3176  *      so that the optimizer doesn't reject these possibilities based on size
3177  *      constraints.
3178  * 2)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3179  *      is used for an EXACTFU node that contains at least one "ss" sequence in
3180  *      it.  For non-UTF-8 patterns and strings, this is the only case where
3181  *      there is a possible fold length change.  That means that a regular
3182  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
3183  *      with length changes, and so can be processed faster.  regexec.c takes
3184  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
3185  *      pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3186  *      known until runtime).  This saves effort in regex matching.  However,
3187  *      the pre-folding isn't done for non-UTF8 patterns because the fold of
3188  *      the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3189  *      forcing the pattern into UTF8 unless necessary.  Also what EXACTF (and,
3190  *      again, EXACTFL) nodes fold to isn't known until runtime.  The fold
3191  *      possibilities for the non-UTF8 patterns are quite simple, except for
3192  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
3193  *      members of a fold-pair, and arrays are set up for all of them so that
3194  *      the other member of the pair can be found quickly.  Code elsewhere in
3195  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3196  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
3197  *      described in the next item.
3198  * 3)   A problem remains for unfolded multi-char folds. (These occur when the
3199  *      validity of the fold won't be known until runtime, and so must remain
3200  *      unfolded for now.  This happens for the sharp s in EXACTF and EXACTFA
3201  *      nodes when the pattern isn't in UTF-8.  (Note, BTW, that there cannot
3202  *      be an EXACTF node with a UTF-8 pattern.)  They also occur for various
3203  *      folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3204  *      The reason this is a problem is that the optimizer part of regexec.c
3205  *      (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3206  *      that a character in the pattern corresponds to at most a single
3207  *      character in the target string.  (And I do mean character, and not byte
3208  *      here, unlike other parts of the documentation that have never been
3209  *      updated to account for multibyte Unicode.)  sharp s in EXACTF and
3210  *      EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3211  *      it can match "\x{17F}\x{17F}".  These, along with other ones in EXACTFL
3212  *      nodes, violate the assumption, and they are the only instances where it
3213  *      is violated.  I'm reluctant to try to change the assumption, as the
3214  *      code involved is impenetrable to me (khw), so instead the code here
3215  *      punts.  This routine examines EXACTFL nodes, and (when the pattern
3216  *      isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3217  *      boolean indicating whether or not the node contains such a fold.  When
3218  *      it is true, the caller sets a flag that later causes the optimizer in
3219  *      this file to not set values for the floating and fixed string lengths,
3220  *      and thus avoids the optimizer code in regexec.c that makes the invalid
3221  *      assumption.  Thus, there is no optimization based on string lengths for
3222  *      EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3223  *      EXACTF and EXACTFA nodes that contain the sharp s.  (The reason the
3224  *      assumption is wrong only in these cases is that all other non-UTF-8
3225  *      folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3226  *      their expanded versions.  (Again, we can't prefold sharp s to 'ss' in
3227  *      EXACTF nodes because we don't know at compile time if it actually
3228  *      matches 'ss' or not.  For EXACTF nodes it will match iff the target
3229  *      string is in UTF-8.  This is in contrast to EXACTFU nodes, where it
3230  *      always matches; and EXACTFA where it never does.  In an EXACTFA node in
3231  *      a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3232  *      problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3233  *      string would require the pattern to be forced into UTF-8, the overhead
3234  *      of which we want to avoid.  Similarly the unfolded multi-char folds in
3235  *      EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3236  *      locale.)
3237  *
3238  *      Similarly, the code that generates tries doesn't currently handle
3239  *      not-already-folded multi-char folds, and it looks like a pain to change
3240  *      that.  Therefore, trie generation of EXACTFA nodes with the sharp s
3241  *      doesn't work.  Instead, such an EXACTFA is turned into a new regnode,
3242  *      EXACTFA_NO_TRIE, which the trie code knows not to handle.  Most people
3243  *      using /iaa matching will be doing so almost entirely with ASCII
3244  *      strings, so this should rarely be encountered in practice */
3245
3246 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3247     if (PL_regkind[OP(scan)] == EXACT) \
3248         join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3249
3250 STATIC U32
3251 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3252                    UV *min_subtract, bool *unfolded_multi_char,
3253                    U32 flags,regnode *val, U32 depth)
3254 {
3255     /* Merge several consecutive EXACTish nodes into one. */
3256     regnode *n = regnext(scan);
3257     U32 stringok = 1;
3258     regnode *next = scan + NODE_SZ_STR(scan);
3259     U32 merged = 0;
3260     U32 stopnow = 0;
3261 #ifdef DEBUGGING
3262     regnode *stop = scan;
3263     GET_RE_DEBUG_FLAGS_DECL;
3264 #else
3265     PERL_UNUSED_ARG(depth);
3266 #endif
3267
3268     PERL_ARGS_ASSERT_JOIN_EXACT;
3269 #ifndef EXPERIMENTAL_INPLACESCAN
3270     PERL_UNUSED_ARG(flags);
3271     PERL_UNUSED_ARG(val);
3272 #endif
3273     DEBUG_PEEP("join",scan,depth);
3274
3275     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
3276      * EXACT ones that are mergeable to the current one. */
3277     while (n
3278            && (PL_regkind[OP(n)] == NOTHING
3279                || (stringok && OP(n) == OP(scan)))
3280            && NEXT_OFF(n)
3281            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3282     {
3283
3284         if (OP(n) == TAIL || n > next)
3285             stringok = 0;
3286         if (PL_regkind[OP(n)] == NOTHING) {
3287             DEBUG_PEEP("skip:",n,depth);
3288             NEXT_OFF(scan) += NEXT_OFF(n);
3289             next = n + NODE_STEP_REGNODE;
3290 #ifdef DEBUGGING
3291             if (stringok)
3292                 stop = n;
3293 #endif
3294             n = regnext(n);
3295         }
3296         else if (stringok) {
3297             const unsigned int oldl = STR_LEN(scan);
3298             regnode * const nnext = regnext(n);
3299
3300             /* XXX I (khw) kind of doubt that this works on platforms (should
3301              * Perl ever run on one) where U8_MAX is above 255 because of lots
3302              * of other assumptions */
3303             /* Don't join if the sum can't fit into a single node */
3304             if (oldl + STR_LEN(n) > U8_MAX)
3305                 break;
3306
3307             DEBUG_PEEP("merg",n,depth);
3308             merged++;
3309
3310             NEXT_OFF(scan) += NEXT_OFF(n);
3311             STR_LEN(scan) += STR_LEN(n);
3312             next = n + NODE_SZ_STR(n);
3313             /* Now we can overwrite *n : */
3314             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3315 #ifdef DEBUGGING
3316             stop = next - 1;
3317 #endif
3318             n = nnext;
3319             if (stopnow) break;
3320         }
3321
3322 #ifdef EXPERIMENTAL_INPLACESCAN
3323         if (flags && !NEXT_OFF(n)) {
3324             DEBUG_PEEP("atch", val, depth);
3325             if (reg_off_by_arg[OP(n)]) {
3326                 ARG_SET(n, val - n);
3327             }
3328             else {
3329                 NEXT_OFF(n) = val - n;
3330             }
3331             stopnow = 1;
3332         }
3333 #endif
3334     }
3335
3336     *min_subtract = 0;
3337     *unfolded_multi_char = FALSE;
3338
3339     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
3340      * can now analyze for sequences of problematic code points.  (Prior to
3341      * this final joining, sequences could have been split over boundaries, and
3342      * hence missed).  The sequences only happen in folding, hence for any
3343      * non-EXACT EXACTish node */
3344     if (OP(scan) != EXACT) {
3345         U8* s0 = (U8*) STRING(scan);
3346         U8* s = s0;
3347         U8* s_end = s0 + STR_LEN(scan);
3348
3349         int total_count_delta = 0;  /* Total delta number of characters that
3350                                        multi-char folds expand to */
3351
3352         /* One pass is made over the node's string looking for all the
3353          * possibilities.  To avoid some tests in the loop, there are two main
3354          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3355          * non-UTF-8 */
3356         if (UTF) {
3357             U8* folded = NULL;
3358
3359             if (OP(scan) == EXACTFL) {
3360                 U8 *d;
3361
3362                 /* An EXACTFL node would already have been changed to another
3363                  * node type unless there is at least one character in it that
3364                  * is problematic; likely a character whose fold definition
3365                  * won't be known until runtime, and so has yet to be folded.
3366                  * For all but the UTF-8 locale, folds are 1-1 in length, but
3367                  * to handle the UTF-8 case, we need to create a temporary
3368                  * folded copy using UTF-8 locale rules in order to analyze it.
3369                  * This is because our macros that look to see if a sequence is
3370                  * a multi-char fold assume everything is folded (otherwise the
3371                  * tests in those macros would be too complicated and slow).
3372                  * Note that here, the non-problematic folds will have already
3373                  * been done, so we can just copy such characters.  We actually
3374                  * don't completely fold the EXACTFL string.  We skip the
3375                  * unfolded multi-char folds, as that would just create work
3376                  * below to figure out the size they already are */
3377
3378                 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3379                 d = folded;
3380                 while (s < s_end) {
3381                     STRLEN s_len = UTF8SKIP(s);
3382                     if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3383                         Copy(s, d, s_len, U8);
3384                         d += s_len;
3385                     }
3386                     else if (is_FOLDS_TO_MULTI_utf8(s)) {
3387                         *unfolded_multi_char = TRUE;
3388                         Copy(s, d, s_len, U8);
3389                         d += s_len;
3390                     }
3391                     else if (isASCII(*s)) {
3392                         *(d++) = toFOLD(*s);
3393                     }
3394                     else {
3395                         STRLEN len;
3396                         _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3397                         d += len;
3398                     }
3399                     s += s_len;
3400                 }
3401
3402                 /* Point the remainder of the routine to look at our temporary
3403                  * folded copy */
3404                 s = folded;
3405                 s_end = d;
3406             } /* End of creating folded copy of EXACTFL string */
3407
3408             /* Examine the string for a multi-character fold sequence.  UTF-8
3409              * patterns have all characters pre-folded by the time this code is
3410              * executed */
3411             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3412                                      length sequence we are looking for is 2 */
3413             {
3414                 int count = 0;  /* How many characters in a multi-char fold */
3415                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3416                 if (! len) {    /* Not a multi-char fold: get next char */
3417                     s += UTF8SKIP(s);
3418                     continue;
3419                 }
3420
3421                 /* Nodes with 'ss' require special handling, except for
3422                  * EXACTFA-ish for which there is no multi-char fold to this */
3423                 if (len == 2 && *s == 's' && *(s+1) == 's'
3424                     && OP(scan) != EXACTFA
3425                     && OP(scan) != EXACTFA_NO_TRIE)
3426                 {
3427                     count = 2;
3428                     if (OP(scan) != EXACTFL) {
3429                         OP(scan) = EXACTFU_SS;
3430                     }
3431                     s += 2;
3432                 }
3433                 else { /* Here is a generic multi-char fold. */
3434                     U8* multi_end  = s + len;
3435
3436                     /* Count how many characters are in it.  In the case of
3437                      * /aa, no folds which contain ASCII code points are
3438                      * allowed, so check for those, and skip if found. */
3439                     if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3440                         count = utf8_length(s, multi_end);
3441                         s = multi_end;
3442                     }
3443                     else {
3444                         while (s < multi_end) {
3445                             if (isASCII(*s)) {
3446                                 s++;
3447                                 goto next_iteration;
3448                             }
3449                             else {
3450                                 s += UTF8SKIP(s);
3451                             }
3452                             count++;
3453                         }
3454                     }
3455                 }
3456
3457                 /* The delta is how long the sequence is minus 1 (1 is how long
3458                  * the character that folds to the sequence is) */
3459                 total_count_delta += count - 1;
3460               next_iteration: ;
3461             }
3462
3463             /* We created a temporary folded copy of the string in EXACTFL
3464              * nodes.  Therefore we need to be sure it doesn't go below zero,
3465              * as the real string could be shorter */
3466             if (OP(scan) == EXACTFL) {
3467                 int total_chars = utf8_length((U8*) STRING(scan),
3468                                            (U8*) STRING(scan) + STR_LEN(scan));
3469                 if (total_count_delta > total_chars) {
3470                     total_count_delta = total_chars;
3471                 }
3472             }
3473
3474             *min_subtract += total_count_delta;
3475             Safefree(folded);
3476         }
3477         else if (OP(scan) == EXACTFA) {
3478
3479             /* Non-UTF-8 pattern, EXACTFA node.  There can't be a multi-char
3480              * fold to the ASCII range (and there are no existing ones in the
3481              * upper latin1 range).  But, as outlined in the comments preceding
3482              * this function, we need to flag any occurrences of the sharp s.
3483              * This character forbids trie formation (because of added
3484              * complexity) */
3485             while (s < s_end) {
3486                 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3487                     OP(scan) = EXACTFA_NO_TRIE;
3488                     *unfolded_multi_char = TRUE;
3489                     break;
3490                 }
3491                 s++;
3492                 continue;
3493             }
3494         }
3495         else {
3496
3497             /* Non-UTF-8 pattern, not EXACTFA node.  Look for the multi-char
3498              * folds that are all Latin1.  As explained in the comments
3499              * preceding this function, we look also for the sharp s in EXACTF
3500              * and EXACTFL nodes; it can be in the final position.  Otherwise
3501              * we can stop looking 1 byte earlier because have to find at least
3502              * two characters for a multi-fold */
3503             const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3504                               ? s_end
3505                               : s_end -1;
3506
3507             while (s < upper) {
3508                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3509                 if (! len) {    /* Not a multi-char fold. */
3510                     if (*s == LATIN_SMALL_LETTER_SHARP_S
3511                         && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
3512                     {
3513                         *unfolded_multi_char = TRUE;
3514                     }
3515                     s++;
3516                     continue;
3517                 }
3518
3519                 if (len == 2
3520                     && isARG2_lower_or_UPPER_ARG1('s', *s)
3521                     && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
3522                 {
3523
3524                     /* EXACTF nodes need to know that the minimum length
3525                      * changed so that a sharp s in the string can match this
3526                      * ss in the pattern, but they remain EXACTF nodes, as they
3527                      * won't match this unless the target string is is UTF-8,
3528                      * which we don't know until runtime.  EXACTFL nodes can't
3529                      * transform into EXACTFU nodes */
3530                     if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
3531                         OP(scan) = EXACTFU_SS;
3532                     }
3533                 }
3534
3535                 *min_subtract += len - 1;
3536                 s += len;
3537             }
3538         }
3539     }
3540
3541 #ifdef DEBUGGING
3542     /* Allow dumping but overwriting the collection of skipped
3543      * ops and/or strings with fake optimized ops */
3544     n = scan + NODE_SZ_STR(scan);
3545     while (n <= stop) {
3546         OP(n) = OPTIMIZED;
3547         FLAGS(n) = 0;
3548         NEXT_OFF(n) = 0;
3549         n++;
3550     }
3551 #endif
3552     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3553     return stopnow;
3554 }
3555
3556 /* REx optimizer.  Converts nodes into quicker variants "in place".
3557    Finds fixed substrings.  */
3558
3559 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3560    to the position after last scanned or to NULL. */
3561
3562 #define INIT_AND_WITHP \
3563     assert(!and_withp); \
3564     Newx(and_withp,1, regnode_ssc); \
3565     SAVEFREEPV(and_withp)
3566
3567 /* this is a chain of data about sub patterns we are processing that
3568    need to be handled separately/specially in study_chunk. Its so
3569    we can simulate recursion without losing state.  */
3570 struct scan_frame;
3571 typedef struct scan_frame {
3572     regnode *last;  /* last node to process in this frame */
3573     regnode *next;  /* next node to process when last is reached */
3574     struct scan_frame *prev; /*previous frame*/
3575     U32 prev_recursed_depth;
3576     I32 stop; /* what stopparen do we use */
3577 } scan_frame;
3578
3579
3580 STATIC SSize_t
3581 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3582                         SSize_t *minlenp, SSize_t *deltap,
3583                         regnode *last,
3584                         scan_data_t *data,
3585                         I32 stopparen,
3586                         U32 recursed_depth,
3587                         regnode_ssc *and_withp,
3588                         U32 flags, U32 depth)
3589                         /* scanp: Start here (read-write). */
3590                         /* deltap: Write maxlen-minlen here. */
3591                         /* last: Stop before this one. */
3592                         /* data: string data about the pattern */
3593                         /* stopparen: treat close N as END */
3594                         /* recursed: which subroutines have we recursed into */
3595                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3596 {
3597     dVAR;
3598     /* There must be at least this number of characters to match */
3599     SSize_t min = 0;
3600     I32 pars = 0, code;
3601     regnode *scan = *scanp, *next;
3602     SSize_t delta = 0;
3603     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3604     int is_inf_internal = 0;            /* The studied chunk is infinite */
3605     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3606     scan_data_t data_fake;
3607     SV *re_trie_maxbuff = NULL;
3608     regnode *first_non_open = scan;
3609     SSize_t stopmin = SSize_t_MAX;
3610     scan_frame *frame = NULL;
3611     GET_RE_DEBUG_FLAGS_DECL;
3612
3613     PERL_ARGS_ASSERT_STUDY_CHUNK;
3614
3615 #ifdef DEBUGGING
3616     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3617 #endif
3618     if ( depth == 0 ) {
3619         while (first_non_open && OP(first_non_open) == OPEN)
3620             first_non_open=regnext(first_non_open);
3621     }
3622
3623
3624   fake_study_recurse:
3625     while ( scan && OP(scan) != END && scan < last ){
3626         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3627                                    node length to get a real minimum (because
3628                                    the folded version may be shorter) */
3629         bool unfolded_multi_char = FALSE;
3630         /* Peephole optimizer: */
3631         DEBUG_OPTIMISE_MORE_r(
3632         {
3633             PerlIO_printf(Perl_debug_log,
3634                 "%*sstudy_chunk stopparen=%ld depth=%lu recursed_depth=%lu ",
3635                 ((int) depth*2), "", (long)stopparen,
3636                 (unsigned long)depth, (unsigned long)recursed_depth);
3637             if (recursed_depth) {
3638                 U32 i;
3639                 U32 j;
3640                 for ( j = 0 ; j < recursed_depth ; j++ ) {
3641                     PerlIO_printf(Perl_debug_log,"[");
3642                     for ( i = 0 ; i < (U32)RExC_npar ; i++ )
3643                         PerlIO_printf(Perl_debug_log,"%d",
3644                             PAREN_TEST(RExC_study_chunk_recursed +
3645                                        (j * RExC_study_chunk_recursed_bytes), i)
3646                             ? 1 : 0
3647                         );
3648                     PerlIO_printf(Perl_debug_log,"]");
3649                 }
3650             }
3651             PerlIO_printf(Perl_debug_log,"\n");
3652         }
3653         );
3654         DEBUG_STUDYDATA("Peep:", data, depth);
3655         DEBUG_PEEP("Peep", scan, depth);
3656
3657
3658         /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/
3659          * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled
3660          * by a different invocation of reg() -- Yves
3661          */
3662         JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
3663
3664         /* Follow the next-chain of the current node and optimize
3665            away all the NOTHINGs from it.  */
3666         if (OP(scan) != CURLYX) {
3667             const int max = (reg_off_by_arg[OP(scan)]
3668                        ? I32_MAX
3669                        /* I32 may be smaller than U16 on CRAYs! */
3670                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3671             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3672             int noff;
3673             regnode *n = scan;
3674
3675             /* Skip NOTHING and LONGJMP. */
3676             while ((n = regnext(n))
3677                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3678                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3679                    && off + noff < max)
3680                 off += noff;
3681             if (reg_off_by_arg[OP(scan)])
3682                 ARG(scan) = off;
3683             else
3684                 NEXT_OFF(scan) = off;
3685         }
3686
3687
3688
3689         /* The principal pseudo-switch.  Cannot be a switch, since we
3690            look into several different things.  */
3691         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3692                    || OP(scan) == IFTHEN) {
3693             next = regnext(scan);
3694             code = OP(scan);
3695             /* demq: the op(next)==code check is to see if we have
3696              * "branch-branch" AFAICT */
3697
3698             if (OP(next) == code || code == IFTHEN) {
3699                 /* NOTE - There is similar code to this block below for
3700                  * handling TRIE nodes on a re-study.  If you change stuff here
3701                  * check there too. */
3702                 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3703                 regnode_ssc accum;
3704                 regnode * const startbranch=scan;
3705
3706                 if (flags & SCF_DO_SUBSTR) {
3707                     /* Cannot merge strings after this. */
3708                     scan_commit(pRExC_state, data, minlenp, is_inf);
3709                 }
3710
3711                 if (flags & SCF_DO_STCLASS)
3712                     ssc_init_zero(pRExC_state, &accum);
3713
3714                 while (OP(scan) == code) {
3715                     SSize_t deltanext, minnext, fake;
3716                     I32 f = 0;
3717                     regnode_ssc this_class;
3718
3719                     num++;
3720                     data_fake.flags = 0;
3721                     if (data) {
3722                         data_fake.whilem_c = data->whilem_c;
3723                         data_fake.last_closep = data->last_closep;
3724                     }
3725                     else
3726                         data_fake.last_closep = &fake;
3727
3728                     data_fake.pos_delta = delta;
3729                     next = regnext(scan);
3730                     scan = NEXTOPER(scan);
3731                     if (code != BRANCH)
3732                         scan = NEXTOPER(scan);
3733                     if (flags & SCF_DO_STCLASS) {
3734                         ssc_init(pRExC_state, &this_class);
3735                         data_fake.start_class = &this_class;
3736                         f = SCF_DO_STCLASS_AND;
3737                     }
3738                     if (flags & SCF_WHILEM_VISITED_POS)
3739                         f |= SCF_WHILEM_VISITED_POS;
3740
3741                     /* we suppose the run is continuous, last=next...*/
3742                     minnext = study_chunk(pRExC_state, &scan, minlenp,
3743                                       &deltanext, next, &data_fake, stopparen,
3744                                       recursed_depth, NULL, f,depth+1);
3745                     if (min1 > minnext)
3746                         min1 = minnext;
3747                     if (deltanext == SSize_t_MAX) {
3748                         is_inf = is_inf_internal = 1;
3749                         max1 = SSize_t_MAX;
3750                     } else if (max1 < minnext + deltanext)
3751                         max1 = minnext + deltanext;
3752                     scan = next;
3753                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3754                         pars++;
3755                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3756                         if ( stopmin > minnext)
3757                             stopmin = min + min1;
3758                         flags &= ~SCF_DO_SUBSTR;
3759                         if (data)
3760                             data->flags |= SCF_SEEN_ACCEPT;
3761                     }
3762                     if (data) {
3763                         if (data_fake.flags & SF_HAS_EVAL)
3764                             data->flags |= SF_HAS_EVAL;
3765                         data->whilem_c = data_fake.whilem_c;
3766                     }
3767                     if (flags & SCF_DO_STCLASS)
3768                         ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
3769                 }
3770                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3771                     min1 = 0;
3772                 if (flags & SCF_DO_SUBSTR) {
3773                     data->pos_min += min1;
3774                     if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3775                         data->pos_delta = SSize_t_MAX;
3776                     else
3777                         data->pos_delta += max1 - min1;
3778                     if (max1 != min1 || is_inf)
3779                         data->longest = &(data->longest_float);
3780                 }
3781                 min += min1;
3782                 if (delta == SSize_t_MAX
3783                  || SSize_t_MAX - delta - (max1 - min1) < 0)
3784                     delta = SSize_t_MAX;
3785                 else
3786                     delta += max1 - min1;
3787                 if (flags & SCF_DO_STCLASS_OR) {
3788                     ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
3789                     if (min1) {
3790                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3791                         flags &= ~SCF_DO_STCLASS;
3792                     }
3793                 }
3794                 else if (flags & SCF_DO_STCLASS_AND) {
3795                     if (min1) {
3796                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3797                         flags &= ~SCF_DO_STCLASS;
3798                     }
3799                     else {
3800                         /* Switch to OR mode: cache the old value of
3801                          * data->start_class */
3802                         INIT_AND_WITHP;
3803                         StructCopy(data->start_class, and_withp, regnode_ssc);
3804                         flags &= ~SCF_DO_STCLASS_AND;
3805                         StructCopy(&accum, data->start_class, regnode_ssc);
3806                         flags |= SCF_DO_STCLASS_OR;
3807                     }
3808                 }
3809
3810                 if (PERL_ENABLE_TRIE_OPTIMISATION &&
3811                         OP( startbranch ) == BRANCH )
3812                 {
3813                 /* demq.
3814
3815                    Assuming this was/is a branch we are dealing with: 'scan'
3816                    now points at the item that follows the branch sequence,
3817                    whatever it is. We now start at the beginning of the
3818                    sequence and look for subsequences of
3819
3820                    BRANCH->EXACT=>x1
3821                    BRANCH->EXACT=>x2
3822                    tail
3823
3824                    which would be constructed from a pattern like
3825                    /A|LIST|OF|WORDS/
3826
3827                    If we can find such a subsequence we need to turn the first
3828                    element into a trie and then add the subsequent branch exact
3829                    strings to the trie.
3830
3831                    We have two cases
3832
3833                      1. patterns where the whole set of branches can be
3834                         converted.
3835
3836                      2. patterns where only a subset can be converted.
3837
3838                    In case 1 we can replace the whole set with a single regop
3839                    for the trie. In case 2 we need to keep the start and end
3840                    branches so
3841
3842                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3843                      becomes BRANCH TRIE; BRANCH X;
3844
3845                   There is an additional case, that being where there is a
3846                   common prefix, which gets split out into an EXACT like node
3847                   preceding the TRIE node.
3848
3849                   If x(1..n)==tail then we can do a simple trie, if not we make
3850                   a "jump" trie, such that when we match the appropriate word
3851                   we "jump" to the appropriate tail node. Essentially we turn
3852                   a nested if into a case structure of sorts.
3853
3854                 */
3855
3856                     int made=0;
3857                     if (!re_trie_maxbuff) {
3858                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3859                         if (!SvIOK(re_trie_maxbuff))
3860                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3861                     }
3862                     if ( SvIV(re_trie_maxbuff)>=0  ) {
3863                         regnode *cur;
3864                         regnode *first = (regnode *)NULL;
3865                         regnode *last = (regnode *)NULL;
3866                         regnode *tail = scan;
3867                         U8 trietype = 0;
3868                         U32 count=0;
3869
3870 #ifdef DEBUGGING
3871                         SV * const mysv = sv_newmortal();   /* for dumping */
3872 #endif
3873                         /* var tail is used because there may be a TAIL
3874                            regop in the way. Ie, the exacts will point to the
3875                            thing following the TAIL, but the last branch will
3876                            point at the TAIL. So we advance tail. If we
3877                            have nested (?:) we may have to move through several
3878                            tails.
3879                          */
3880
3881                         while ( OP( tail ) == TAIL ) {
3882                             /* this is the TAIL generated by (?:) */
3883                             tail = regnext( tail );
3884                         }
3885
3886
3887                         DEBUG_TRIE_COMPILE_r({
3888                             regprop(RExC_rx, mysv, tail, NULL);
3889                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3890                               (int)depth * 2 + 2, "",
3891                               "Looking for TRIE'able sequences. Tail node is: ",
3892                               SvPV_nolen_const( mysv )
3893                             );
3894                         });
3895
3896                         /*
3897
3898                             Step through the branches
3899                                 cur represents each branch,
3900                                 noper is the first thing to be matched as part
3901                                       of that branch
3902                                 noper_next is the regnext() of that node.
3903
3904                             We normally handle a case like this
3905                             /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
3906                             support building with NOJUMPTRIE, which restricts
3907                             the trie logic to structures like /FOO|BAR/.
3908
3909                             If noper is a trieable nodetype then the branch is
3910                             a possible optimization target. If we are building
3911                             under NOJUMPTRIE then we require that noper_next is
3912                             the same as scan (our current position in the regex
3913                             program).
3914
3915                             Once we have two or more consecutive such branches
3916                             we can create a trie of the EXACT's contents and
3917                             stitch it in place into the program.
3918
3919                             If the sequence represents all of the branches in
3920                             the alternation we replace the entire thing with a
3921                             single TRIE node.
3922
3923                             Otherwise when it is a subsequence we need to
3924                             stitch it in place and replace only the relevant
3925                             branches. This means the first branch has to remain
3926                             as it is used by the alternation logic, and its
3927                             next pointer, and needs to be repointed at the item
3928                             on the branch chain following the last branch we
3929                             have optimized away.
3930
3931                             This could be either a BRANCH, in which case the
3932                             subsequence is internal, or it could be the item
3933                             following the branch sequence in which case the
3934                             subsequence is at the end (which does not
3935                             necessarily mean the first node is the start of the
3936                             alternation).
3937
3938                             TRIE_TYPE(X) is a define which maps the optype to a
3939                             trietype.
3940
3941                                 optype          |  trietype
3942                                 ----------------+-----------
3943                                 NOTHING         | NOTHING
3944                                 EXACT           | EXACT
3945                                 EXACTFU         | EXACTFU
3946                                 EXACTFU_SS      | EXACTFU
3947                                 EXACTFA         | EXACTFA
3948
3949
3950                         */
3951 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3952                        ( EXACT == (X) )   ? EXACT :        \
3953                        ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU :        \
3954                        ( EXACTFA == (X) ) ? EXACTFA :        \
3955                        0 )
3956
3957                         /* dont use tail as the end marker for this traverse */
3958                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3959                             regnode * const noper = NEXTOPER( cur );
3960                             U8 noper_type = OP( noper );
3961                             U8 noper_trietype = TRIE_TYPE( noper_type );
3962 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3963                             regnode * const noper_next = regnext( noper );
3964                             U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3965                             U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3966 #endif
3967
3968                             DEBUG_TRIE_COMPILE_r({
3969                                 regprop(RExC_rx, mysv, cur, NULL);
3970                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3971                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3972
3973                                 regprop(RExC_rx, mysv, noper, NULL);
3974                                 PerlIO_printf( Perl_debug_log, " -> %s",
3975                                     SvPV_nolen_const(mysv));
3976
3977                                 if ( noper_next ) {
3978                                   regprop(RExC_rx, mysv, noper_next, NULL);
3979                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3980                                     SvPV_nolen_const(mysv));
3981                                 }
3982                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3983                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3984                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3985                                 );
3986                             });
3987
3988                             /* Is noper a trieable nodetype that can be merged
3989                              * with the current trie (if there is one)? */
3990                             if ( noper_trietype
3991                                   &&
3992                                   (
3993                                         ( noper_trietype == NOTHING)
3994                                         || ( trietype == NOTHING )
3995                                         || ( trietype == noper_trietype )
3996                                   )
3997 #ifdef NOJUMPTRIE
3998                                   && noper_next == tail
3999 #endif
4000                                   && count < U16_MAX)
4001                             {
4002                                 /* Handle mergable triable node Either we are
4003                                  * the first node in a new trieable sequence,
4004                                  * in which case we do some bookkeeping,
4005                                  * otherwise we update the end pointer. */
4006                                 if ( !first ) {
4007                                     first = cur;
4008                                     if ( noper_trietype == NOTHING ) {
4009 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4010                                         regnode * const noper_next = regnext( noper );
4011                                         U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
4012                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4013 #endif
4014
4015                                         if ( noper_next_trietype ) {
4016                                             trietype = noper_next_trietype;
4017                                         } else if (noper_next_type)  {
4018                                             /* a NOTHING regop is 1 regop wide.
4019                                              * We need at least two for a trie
4020                                              * so we can't merge this in */
4021                                             first = NULL;
4022                                         }
4023                                     } else {
4024                                         trietype = noper_trietype;
4025                                     }
4026                                 } else {
4027                                     if ( trietype == NOTHING )
4028                                         trietype = noper_trietype;
4029                                     last = cur;
4030                                 }
4031                                 if (first)
4032                                     count++;
4033                             } /* end handle mergable triable node */
4034                             else {
4035                                 /* handle unmergable node -
4036                                  * noper may either be a triable node which can
4037                                  * not be tried together with the current trie,
4038                                  * or a non triable node */
4039                                 if ( last ) {
4040                                     /* If last is set and trietype is not
4041                                      * NOTHING then we have found at least two
4042                                      * triable branch sequences in a row of a
4043                                      * similar trietype so we can turn them
4044                                      * into a trie. If/when we allow NOTHING to
4045                                      * start a trie sequence this condition
4046                                      * will be required, and it isn't expensive
4047                                      * so we leave it in for now. */
4048                                     if ( trietype && trietype != NOTHING )
4049                                         make_trie( pRExC_state,
4050                                                 startbranch, first, cur, tail,
4051                                                 count, trietype, depth+1 );
4052                                     last = NULL; /* note: we clear/update
4053                                                     first, trietype etc below,
4054                                                     so we dont do it here */
4055                                 }
4056                                 if ( noper_trietype
4057 #ifdef NOJUMPTRIE
4058                                      && noper_next == tail
4059 #endif
4060                                 ){
4061                                     /* noper is triable, so we can start a new
4062                                      * trie sequence */
4063                                     count = 1;
4064                                     first = cur;
4065                                     trietype = noper_trietype;
4066                                 } else if (first) {
4067                                     /* if we already saw a first but the
4068                                      * current node is not triable then we have
4069                                      * to reset the first information. */
4070                                     count = 0;
4071                                     first = NULL;
4072                                     trietype = 0;
4073                                 }
4074                             } /* end handle unmergable node */
4075                         } /* loop over branches */
4076                         DEBUG_TRIE_COMPILE_r({
4077                             regprop(RExC_rx, mysv, cur, NULL);
4078                             PerlIO_printf( Perl_debug_log,
4079                               "%*s- %s (%d) <SCAN FINISHED>\n",
4080                               (int)depth * 2 + 2,
4081                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4082
4083                         });
4084                         if ( last && trietype ) {
4085                             if ( trietype != NOTHING ) {
4086                                 /* the last branch of the sequence was part of
4087                                  * a trie, so we have to construct it here
4088                                  * outside of the loop */
4089                                 made= make_trie( pRExC_state, startbranch,
4090                                                  first, scan, tail, count,
4091                                                  trietype, depth+1 );
4092 #ifdef TRIE_STUDY_OPT
4093                                 if ( ((made == MADE_EXACT_TRIE &&
4094                                      startbranch == first)
4095                                      || ( first_non_open == first )) &&
4096                                      depth==0 ) {
4097                                     flags |= SCF_TRIE_RESTUDY;
4098                                     if ( startbranch == first
4099                                          && scan == tail )
4100                                     {
4101                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4102                                     }
4103                                 }
4104 #endif
4105                             } else {
4106                                 /* at this point we know whatever we have is a
4107                                  * NOTHING sequence/branch AND if 'startbranch'
4108                                  * is 'first' then we can turn the whole thing
4109                                  * into a NOTHING
4110                                  */
4111                                 if ( startbranch == first ) {
4112                                     regnode *opt;
4113                                     /* the entire thing is a NOTHING sequence,
4114                                      * something like this: (?:|) So we can
4115                                      * turn it into a plain NOTHING op. */
4116                                     DEBUG_TRIE_COMPILE_r({
4117                                         regprop(RExC_rx, mysv, cur, NULL);
4118                                         PerlIO_printf( Perl_debug_log,
4119                                           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
4120                                           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4121
4122                                     });
4123                                     OP(startbranch)= NOTHING;
4124                                     NEXT_OFF(startbranch)= tail - startbranch;
4125                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
4126                                         OP(opt)= OPTIMIZED;
4127                                 }
4128                             }
4129                         } /* end if ( last) */
4130                     } /* TRIE_MAXBUF is non zero */
4131
4132                 } /* do trie */
4133
4134             }
4135             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
4136                 scan = NEXTOPER(NEXTOPER(scan));
4137             } else                      /* single branch is optimized. */
4138                 scan = NEXTOPER(scan);
4139             continue;
4140         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
4141             scan_frame *newframe = NULL;
4142             I32 paren;
4143             regnode *start;
4144             regnode *end;
4145             U32 my_recursed_depth= recursed_depth;
4146
4147             if (OP(scan) != SUSPEND) {
4148                 /* set the pointer */
4149                 if (OP(scan) == GOSUB) {
4150                     paren = ARG(scan);
4151                     RExC_recurse[ARG2L(scan)] = scan;
4152                     start = RExC_open_parens[paren-1];
4153                     end   = RExC_close_parens[paren-1];
4154                 } else {
4155                     paren = 0;
4156                     start = RExC_rxi->program + 1;
4157                     end   = RExC_opend;
4158                 }
4159                 if (!recursed_depth
4160                     ||
4161                     !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4162                 ) {
4163                     if (!recursed_depth) {
4164                         Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4165                     } else {
4166                         Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4167                              RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4168                              RExC_study_chunk_recursed_bytes, U8);
4169                     }
4170                     /* we havent recursed into this paren yet, so recurse into it */
4171                     DEBUG_STUDYDATA("set:", data,depth);
4172                     PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4173                     my_recursed_depth= recursed_depth + 1;
4174                     Newx(newframe,1,scan_frame);
4175                 } else {
4176                     DEBUG_STUDYDATA("inf:", data,depth);
4177                     /* some form of infinite recursion, assume infinite length
4178                      * */
4179                     if (flags & SCF_DO_SUBSTR) {
4180                         scan_commit(pRExC_state, data, minlenp, is_inf);
4181                         data->longest = &(data->longest_float);
4182                     }
4183                     is_inf = is_inf_internal = 1;
4184                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4185                         ssc_anything(data->start_class);
4186                     flags &= ~SCF_DO_STCLASS;
4187                 }
4188             } else {
4189                 Newx(newframe,1,scan_frame);
4190                 paren = stopparen;
4191                 start = scan+2;
4192                 end = regnext(scan);
4193             }
4194             if (newframe) {
4195                 assert(start);
4196                 assert(end);
4197                 SAVEFREEPV(newframe);
4198                 newframe->next = regnext(scan);
4199                 newframe->last = last;
4200                 newframe->stop = stopparen;
4201                 newframe->prev = frame;
4202                 newframe->prev_recursed_depth = recursed_depth;
4203
4204                 DEBUG_STUDYDATA("frame-new:",data,depth);
4205                 DEBUG_PEEP("fnew", scan, depth);
4206
4207                 frame = newframe;
4208                 scan =  start;
4209                 stopparen = paren;
4210                 last = end;
4211                 depth = depth + 1;
4212                 recursed_depth= my_recursed_depth;
4213
4214                 continue;
4215             }
4216         }
4217         else if (OP(scan) == EXACT) {
4218             SSize_t l = STR_LEN(scan);
4219             UV uc;
4220             if (UTF) {
4221                 const U8 * const s = (U8*)STRING(scan);
4222                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4223                 l = utf8_length(s, s + l);
4224             } else {
4225                 uc = *((U8*)STRING(scan));
4226             }
4227             min += l;
4228             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4229                 /* The code below prefers earlier match for fixed
4230                    offset, later match for variable offset.  */
4231                 if (data->last_end == -1) { /* Update the start info. */
4232                     data->last_start_min = data->pos_min;
4233                     data->last_start_max = is_inf
4234                         ? SSize_t_MAX : data->pos_min + data->pos_delta;
4235                 }
4236                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4237                 if (UTF)
4238                     SvUTF8_on(data->last_found);
4239                 {
4240                     SV * const sv = data->last_found;
4241                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4242                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4243                     if (mg && mg->mg_len >= 0)
4244                         mg->mg_len += utf8_length((U8*)STRING(scan),
4245                                               (U8*)STRING(scan)+STR_LEN(scan));
4246                 }
4247                 data->last_end = data->pos_min + l;
4248                 data->pos_min += l; /* As in the first entry. */
4249                 data->flags &= ~SF_BEFORE_EOL;
4250             }
4251
4252             /* ANDing the code point leaves at most it, and not in locale, and
4253              * can't match null string */
4254             if (flags & SCF_DO_STCLASS_AND) {
4255                 ssc_cp_and(data->start_class, uc);
4256                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4257                 ssc_clear_locale(data->start_class);
4258             }
4259             else if (flags & SCF_DO_STCLASS_OR) {
4260                 ssc_add_cp(data->start_class, uc);
4261                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4262
4263                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4264                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4265             }
4266             flags &= ~SCF_DO_STCLASS;
4267         }
4268         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT!, so is
4269                                                      EXACTFish */
4270             SSize_t l = STR_LEN(scan);
4271             UV uc = *((U8*)STRING(scan));
4272             SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
4273                                                      separate code points */
4274             const U8 * s = (U8*)STRING(scan);
4275
4276             /* Search for fixed substrings supports EXACT only. */
4277             if (flags & SCF_DO_SUBSTR) {
4278                 assert(data);
4279                 scan_commit(pRExC_state, data, minlenp, is_inf);
4280             }
4281             if (UTF) {
4282                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4283                 l = utf8_length(s, s + l);
4284             }
4285             if (unfolded_multi_char) {
4286                 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4287             }
4288             min += l - min_subtract;
4289             assert (min >= 0);
4290             delta += min_subtract;
4291             if (flags & SCF_DO_SUBSTR) {
4292                 data->pos_min += l - min_subtract;
4293                 if (data->pos_min < 0) {
4294                     data->pos_min = 0;
4295                 }
4296                 data->pos_delta += min_subtract;
4297                 if (min_subtract) {
4298                     data->longest = &(data->longest_float);
4299                 }
4300             }
4301
4302             if (OP(scan) != EXACTFL && flags & SCF_DO_STCLASS_AND) {
4303                 ssc_clear_locale(data->start_class);
4304             }
4305
4306             if (! UTF) {
4307
4308                 /* We punt and assume can match anything if the node begins
4309                  * with a multi-character fold.  Things are complicated.  For
4310                  * example, /ffi/i could match any of:
4311                  *  "\N{LATIN SMALL LIGATURE FFI}"
4312                  *  "\N{LATIN SMALL LIGATURE FF}I"
4313                  *  "F\N{LATIN SMALL LIGATURE FI}"
4314                  *  plus several other things; and making sure we have all the
4315                  *  possibilities is hard. */
4316                 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + STR_LEN(scan))) {
4317                     EXACTF_invlist =
4318                              _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
4319                 }
4320                 else {
4321
4322                     /* Any Latin1 range character can potentially match any
4323                      * other depending on the locale */
4324                     if (OP(scan) == EXACTFL) {
4325                         _invlist_union(EXACTF_invlist, PL_Latin1,
4326                                                               &EXACTF_invlist);
4327                     }
4328                     else {
4329                         /* But otherwise, it matches at least itself.  We can
4330                          * quickly tell if it has a distinct fold, and if so,
4331                          * it matches that as well */
4332                         EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4333                         if (IS_IN_SOME_FOLD_L1(uc)) {
4334                             EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
4335                                                            PL_fold_latin1[uc]);
4336                         }
4337                     }
4338
4339                     /* Some characters match above-Latin1 ones under /i.  This
4340                      * is true of EXACTFL ones when the locale is UTF-8 */
4341                     if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
4342                         && (! isASCII(uc) || (OP(scan) != EXACTFA
4343                                             && OP(scan) != EXACTFA_NO_TRIE)))
4344                     {
4345                         add_above_Latin1_folds(pRExC_state,
4346                                                (U8) uc,
4347                                                &EXACTF_invlist);
4348                     }
4349                 }
4350             }
4351             else {  /* Pattern is UTF-8 */
4352                 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
4353                 STRLEN foldlen = UTF8SKIP(s);
4354                 const U8* e = s + STR_LEN(scan);
4355                 SV** listp;
4356
4357                 /* The only code points that aren't folded in a UTF EXACTFish
4358                  * node are are the problematic ones in EXACTFL nodes */
4359                 if (OP(scan) == EXACTFL
4360                     && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc))
4361                 {
4362                     /* We need to check for the possibility that this EXACTFL
4363                      * node begins with a multi-char fold.  Therefore we fold
4364                      * the first few characters of it so that we can make that
4365                      * check */
4366                     U8 *d = folded;
4367                     int i;
4368
4369                     for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
4370                         if (isASCII(*s)) {
4371                             *(d++) = (U8) toFOLD(*s);
4372                             s++;
4373                         }
4374                         else {
4375                             STRLEN len;
4376                             to_utf8_fold(s, d, &len);
4377                             d += len;
4378                             s += UTF8SKIP(s);
4379                         }
4380                     }
4381
4382                     /* And set up so the code below that looks in this folded
4383                      * buffer instead of the node's string */
4384                     e = d;
4385                     foldlen = UTF8SKIP(folded);
4386                     s = folded;
4387                 }
4388
4389                 /* When we reach here 's' points to the fold of the first
4390                  * character(s) of the node; and 'e' points to far enough along
4391                  * the folded string to be just past any possible multi-char
4392                  * fold. 'foldlen' is the length in bytes of the first
4393                  * character in 's'
4394                  *
4395                  * Unlike the non-UTF-8 case, the macro for determining if a
4396                  * string is a multi-char fold requires all the characters to
4397                  * already be folded.  This is because of all the complications
4398                  * if not.  Note that they are folded anyway, except in EXACTFL
4399                  * nodes.  Like the non-UTF case above, we punt if the node
4400                  * begins with a multi-char fold  */
4401
4402                 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
4403                     EXACTF_invlist =
4404                              _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
4405                 }
4406                 else {  /* Single char fold */
4407
4408                     /* It matches all the things that fold to it, which are
4409                      * found in PL_utf8_foldclosures (including itself) */
4410                     EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4411                     if (! PL_utf8_foldclosures) {
4412                         _load_PL_utf8_foldclosures();
4413                     }
4414                     if ((listp = hv_fetch(PL_utf8_foldclosures,
4415                                         (char *) s, foldlen, FALSE)))
4416                     {
4417                         AV* list = (AV*) *listp;
4418                         IV k;
4419                         for (k = 0; k <= av_tindex(list); k++) {
4420                             SV** c_p = av_fetch(list, k, FALSE);
4421                             UV c;
4422                             assert(c_p);
4423
4424                             c = SvUV(*c_p);
4425
4426                             /* /aa doesn't allow folds between ASCII and non- */
4427                             if ((OP(scan) == EXACTFA || OP(scan) == EXACTFA_NO_TRIE)
4428                                 && isASCII(c) != isASCII(uc))
4429                             {
4430                                 continue;
4431                             }
4432
4433                             EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, c);
4434                         }
4435                     }
4436                 }
4437             }
4438             if (flags & SCF_DO_STCLASS_AND) {
4439                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4440                 ANYOF_POSIXL_ZERO(data->start_class);
4441                 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4442             }
4443             else if (flags & SCF_DO_STCLASS_OR) {
4444                 ssc_union(data->start_class, EXACTF_invlist, FALSE);
4445                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4446
4447                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4448                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4449             }
4450             flags &= ~SCF_DO_STCLASS;
4451             SvREFCNT_dec(EXACTF_invlist);
4452         }
4453         else if (REGNODE_VARIES(OP(scan))) {
4454             SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4455             I32 fl = 0, f = flags;
4456             regnode * const oscan = scan;
4457             regnode_ssc this_class;
4458             regnode_ssc *oclass = NULL;
4459             I32 next_is_eval = 0;
4460
4461             switch (PL_regkind[OP(scan)]) {
4462             case WHILEM:                /* End of (?:...)* . */
4463                 scan = NEXTOPER(scan);
4464                 goto finish;
4465             case PLUS:
4466                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4467                     next = NEXTOPER(scan);
4468                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
4469                         mincount = 1;
4470                         maxcount = REG_INFTY;
4471                         next = regnext(scan);
4472                         scan = NEXTOPER(scan);
4473                         goto do_curly;
4474                     }
4475                 }
4476                 if (flags & SCF_DO_SUBSTR)
4477                     data->pos_min++;
4478                 min++;
4479                 /* FALLTHROUGH */
4480             case STAR:
4481                 if (flags & SCF_DO_STCLASS) {
4482                     mincount = 0;
4483                     maxcount = REG_INFTY;
4484                     next = regnext(scan);
4485                     scan = NEXTOPER(scan);
4486                     goto do_curly;
4487                 }
4488                 if (flags & SCF_DO_SUBSTR) {
4489                     scan_commit(pRExC_state, data, minlenp, is_inf);
4490                     /* Cannot extend fixed substrings */
4491                     data->longest = &(data->longest_float);
4492                 }
4493                 is_inf = is_inf_internal = 1;
4494                 scan = regnext(scan);
4495                 goto optimize_curly_tail;
4496             case CURLY:
4497                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4498                     && (scan->flags == stopparen))
4499                 {
4500                     mincount = 1;
4501                     maxcount = 1;
4502                 } else {
4503                     mincount = ARG1(scan);
4504                     maxcount = ARG2(scan);
4505                 }
4506                 next = regnext(scan);
4507                 if (OP(scan) == CURLYX) {
4508                     I32 lp = (data ? *(data->last_closep) : 0);
4509                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4510                 }
4511                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4512                 next_is_eval = (OP(scan) == EVAL);
4513               do_curly:
4514                 if (flags & SCF_DO_SUBSTR) {
4515                     if (mincount == 0)
4516                         scan_commit(pRExC_state, data, minlenp, is_inf);
4517                     /* Cannot extend fixed substrings */
4518                     pos_before = data->pos_min;
4519                 }
4520                 if (data) {
4521                     fl = data->flags;
4522                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4523                     if (is_inf)
4524                         data->flags |= SF_IS_INF;
4525                 }
4526                 if (flags & SCF_DO_STCLASS) {
4527                     ssc_init(pRExC_state, &this_class);
4528                     oclass = data->start_class;
4529                     data->start_class = &this_class;
4530                     f |= SCF_DO_STCLASS_AND;
4531                     f &= ~SCF_DO_STCLASS_OR;
4532                 }
4533                 /* Exclude from super-linear cache processing any {n,m}
4534                    regops for which the combination of input pos and regex
4535                    pos is not enough information to determine if a match
4536                    will be possible.
4537
4538                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
4539                    regex pos at the \s*, the prospects for a match depend not
4540                    only on the input position but also on how many (bar\s*)
4541                    repeats into the {4,8} we are. */
4542                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4543                     f &= ~SCF_WHILEM_VISITED_POS;
4544
4545                 /* This will finish on WHILEM, setting scan, or on NULL: */
4546                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4547                                   last, data, stopparen, recursed_depth, NULL,
4548                                   (mincount == 0
4549                                    ? (f & ~SCF_DO_SUBSTR)
4550                                    : f)
4551                                   ,depth+1);
4552
4553                 if (flags & SCF_DO_STCLASS)
4554                     data->start_class = oclass;
4555                 if (mincount == 0 || minnext == 0) {
4556                     if (flags & SCF_DO_STCLASS_OR) {
4557                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4558                     }
4559                     else if (flags & SCF_DO_STCLASS_AND) {
4560                         /* Switch to OR mode: cache the old value of
4561                          * data->start_class */
4562                         INIT_AND_WITHP;
4563                         StructCopy(data->start_class, and_withp, regnode_ssc);
4564                         flags &= ~SCF_DO_STCLASS_AND;
4565                         StructCopy(&this_class, data->start_class, regnode_ssc);
4566                         flags |= SCF_DO_STCLASS_OR;
4567                         ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
4568                     }
4569                 } else {                /* Non-zero len */
4570                     if (flags & SCF_DO_STCLASS_OR) {
4571                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4572                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4573                     }
4574                     else if (flags & SCF_DO_STCLASS_AND)
4575                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4576                     flags &= ~SCF_DO_STCLASS;
4577                 }
4578                 if (!scan)              /* It was not CURLYX, but CURLY. */
4579                     scan = next;
4580                 if (!(flags & SCF_TRIE_DOING_RESTUDY)
4581                     /* ? quantifier ok, except for (?{ ... }) */
4582                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
4583                     && (minnext == 0) && (deltanext == 0)
4584                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4585                     && maxcount <= REG_INFTY/3) /* Complement check for big
4586                                                    count */
4587                 {
4588                     /* Fatal warnings may leak the regexp without this: */
4589                     SAVEFREESV(RExC_rx_sv);
4590                     ckWARNreg(RExC_parse,
4591                             "Quantifier unexpected on zero-length expression");
4592                     (void)ReREFCNT_inc(RExC_rx_sv);
4593                 }
4594
4595                 min += minnext * mincount;
4596                 is_inf_internal |= deltanext == SSize_t_MAX
4597                          || (maxcount == REG_INFTY && minnext + deltanext > 0);
4598                 is_inf |= is_inf_internal;
4599                 if (is_inf) {
4600                     delta = SSize_t_MAX;
4601                 } else {
4602                     delta += (minnext + deltanext) * maxcount
4603                              - minnext * mincount;
4604                 }
4605                 /* Try powerful optimization CURLYX => CURLYN. */
4606                 if (  OP(oscan) == CURLYX && data
4607                       && data->flags & SF_IN_PAR
4608                       && !(data->flags & SF_HAS_EVAL)
4609                       && !deltanext && minnext == 1 ) {
4610                     /* Try to optimize to CURLYN.  */
4611                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4612                     regnode * const nxt1 = nxt;
4613 #ifdef DEBUGGING
4614                     regnode *nxt2;
4615 #endif
4616
4617                     /* Skip open. */
4618                     nxt = regnext(nxt);
4619                     if (!REGNODE_SIMPLE(OP(nxt))
4620                         && !(PL_regkind[OP(nxt)] == EXACT
4621                              && STR_LEN(nxt) == 1))
4622                         goto nogo;
4623 #ifdef DEBUGGING
4624                     nxt2 = nxt;
4625 #endif
4626                     nxt = regnext(nxt);
4627                     if (OP(nxt) != CLOSE)
4628                         goto nogo;
4629                     if (RExC_open_parens) {
4630                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4631                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4632                     }
4633                     /* Now we know that nxt2 is the only contents: */
4634                     oscan->flags = (U8)ARG(nxt);
4635                     OP(oscan) = CURLYN;
4636                     OP(nxt1) = NOTHING; /* was OPEN. */
4637
4638 #ifdef DEBUGGING
4639                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4640                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4641                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4642                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
4643                     OP(nxt + 1) = OPTIMIZED; /* was count. */
4644                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4645 #endif
4646                 }
4647               nogo:
4648
4649                 /* Try optimization CURLYX => CURLYM. */
4650                 if (  OP(oscan) == CURLYX && data
4651                       && !(data->flags & SF_HAS_PAR)
4652                       && !(data->flags & SF_HAS_EVAL)
4653                       && !deltanext     /* atom is fixed width */
4654                       && minnext != 0   /* CURLYM can't handle zero width */
4655
4656                          /* Nor characters whose fold at run-time may be
4657                           * multi-character */
4658                       && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
4659                 ) {
4660                     /* XXXX How to optimize if data == 0? */
4661                     /* Optimize to a simpler form.  */
4662                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4663                     regnode *nxt2;
4664
4665                     OP(oscan) = CURLYM;
4666                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4667                             && (OP(nxt2) != WHILEM))
4668                         nxt = nxt2;
4669                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
4670                     /* Need to optimize away parenths. */
4671                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4672                         /* Set the parenth number.  */
4673                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4674
4675                         oscan->flags = (U8)ARG(nxt);
4676                         if (RExC_open_parens) {
4677                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4678                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4679                         }
4680                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
4681                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
4682
4683 #ifdef DEBUGGING
4684                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4685                         OP(nxt + 1) = OPTIMIZED; /* was count. */
4686                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4687                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4688 #endif
4689 #if 0
4690                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
4691                             regnode *nnxt = regnext(nxt1);
4692                             if (nnxt == nxt) {
4693                                 if (reg_off_by_arg[OP(nxt1)])
4694                                     ARG_SET(nxt1, nxt2 - nxt1);
4695                                 else if (nxt2 - nxt1 < U16_MAX)
4696                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
4697                                 else
4698                                     OP(nxt) = NOTHING;  /* Cannot beautify */
4699                             }
4700                             nxt1 = nnxt;
4701                         }
4702 #endif
4703                         /* Optimize again: */
4704                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4705                                     NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4706                     }
4707                     else
4708                         oscan->flags = 0;
4709                 }
4710                 else if ((OP(oscan) == CURLYX)
4711                          && (flags & SCF_WHILEM_VISITED_POS)
4712                          /* See the comment on a similar expression above.
4713                             However, this time it's not a subexpression
4714                             we care about, but the expression itself. */
4715                          && (maxcount == REG_INFTY)
4716                          && data && ++data->whilem_c < 16) {
4717                     /* This stays as CURLYX, we can put the count/of pair. */
4718                     /* Find WHILEM (as in regexec.c) */
4719                     regnode *nxt = oscan + NEXT_OFF(oscan);
4720
4721                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4722                         nxt += ARG(nxt);
4723                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
4724                         | (RExC_whilem_seen << 4)); /* On WHILEM */
4725                 }
4726                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4727                     pars++;
4728                 if (flags & SCF_DO_SUBSTR) {
4729                     SV *last_str = NULL;
4730                     STRLEN last_chrs = 0;
4731                     int counted = mincount != 0;
4732
4733                     if (data->last_end > 0 && mincount != 0) { /* Ends with a
4734                                                                   string. */
4735                         SSize_t b = pos_before >= data->last_start_min
4736                             ? pos_before : data->last_start_min;
4737                         STRLEN l;
4738                         const char * const s = SvPV_const(data->last_found, l);
4739                         SSize_t old = b - data->last_start_min;
4740
4741                         if (UTF)
4742                             old = utf8_hop((U8*)s, old) - (U8*)s;
4743                         l -= old;
4744                         /* Get the added string: */
4745                         last_str = newSVpvn_utf8(s  + old, l, UTF);
4746                         last_chrs = UTF ? utf8_length((U8*)(s + old),
4747                                             (U8*)(s + old + l)) : l;
4748                         if (deltanext == 0 && pos_before == b) {
4749                             /* What was added is a constant string */
4750                             if (mincount > 1) {
4751
4752                                 SvGROW(last_str, (mincount * l) + 1);
4753                                 repeatcpy(SvPVX(last_str) + l,
4754                                           SvPVX_const(last_str), l,
4755                                           mincount - 1);
4756                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4757                                 /* Add additional parts. */
4758                                 SvCUR_set(data->last_found,
4759                                           SvCUR(data->last_found) - l);
4760                                 sv_catsv(data->last_found, last_str);
4761                                 {
4762                                     SV * sv = data->last_found;
4763                                     MAGIC *mg =
4764                                         SvUTF8(sv) && SvMAGICAL(sv) ?
4765                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4766                                     if (mg && mg->mg_len >= 0)
4767                                         mg->mg_len += last_chrs * (mincount-1);
4768                                 }
4769                                 last_chrs *= mincount;
4770                                 data->last_end += l * (mincount - 1);
4771                             }
4772                         } else {
4773                             /* start offset must point into the last copy */
4774                             data->last_start_min += minnext * (mincount - 1);
4775                             data->last_start_max += is_inf ? SSize_t_MAX
4776                                 : (maxcount - 1) * (minnext + data->pos_delta);
4777                         }
4778                     }
4779                     /* It is counted once already... */
4780                     data->pos_min += minnext * (mincount - counted);
4781 #if 0
4782 PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf
4783                               " SSize_t_MAX=%"UVuf" minnext=%"UVuf
4784                               " maxcount=%"UVuf" mincount=%"UVuf"\n",
4785     (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4786     (UV)mincount);
4787 if (deltanext != SSize_t_MAX)
4788 PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
4789     (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4790           - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4791 #endif
4792                     if (deltanext == SSize_t_MAX
4793                         || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4794                         data->pos_delta = SSize_t_MAX;
4795                     else
4796                         data->pos_delta += - counted * deltanext +
4797                         (minnext + deltanext) * maxcount - minnext * mincount;
4798                     if (mincount != maxcount) {
4799                          /* Cannot extend fixed substrings found inside
4800                             the group.  */
4801                         scan_commit(pRExC_state, data, minlenp, is_inf);
4802                         if (mincount && last_str) {
4803                             SV * const sv = data->last_found;
4804                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4805                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4806
4807                             if (mg)
4808                                 mg->mg_len = -1;
4809                             sv_setsv(sv, last_str);
4810                             data->last_end = data->pos_min;
4811                             data->last_start_min = data->pos_min - last_chrs;
4812                             data->last_start_max = is_inf
4813                                 ? SSize_t_MAX
4814                                 : data->pos_min + data->pos_delta - last_chrs;
4815                         }
4816                         data->longest = &(data->longest_float);
4817                     }
4818                     SvREFCNT_dec(last_str);
4819                 }
4820                 if (data && (fl & SF_HAS_EVAL))
4821                     data->flags |= SF_HAS_EVAL;
4822               optimize_curly_tail:
4823                 if (OP(oscan) != CURLYX) {
4824                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4825                            && NEXT_OFF(next))
4826                         NEXT_OFF(oscan) += NEXT_OFF(next);
4827                 }
4828                 continue;
4829
4830             default:
4831 #ifdef DEBUGGING
4832                 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
4833                                                                     OP(scan));
4834 #endif
4835             case REF:
4836             case CLUMP:
4837                 if (flags & SCF_DO_SUBSTR) {
4838                     /* Cannot expect anything... */
4839                     scan_commit(pRExC_state, data, minlenp, is_inf);
4840                     data->longest = &(data->longest_float);
4841                 }
4842                 is_inf = is_inf_internal = 1;
4843                 if (flags & SCF_DO_STCLASS_OR) {
4844                     if (OP(scan) == CLUMP) {
4845                         /* Actually is any start char, but very few code points
4846                          * aren't start characters */
4847                         ssc_match_all_cp(data->start_class);
4848                     }
4849                     else {
4850                         ssc_anything(data->start_class);
4851                     }
4852                 }
4853                 flags &= ~SCF_DO_STCLASS;
4854                 break;
4855             }
4856         }
4857         else if (OP(scan) == LNBREAK) {
4858             if (flags & SCF_DO_STCLASS) {
4859                 if (flags & SCF_DO_STCLASS_AND) {
4860                     ssc_intersection(data->start_class,
4861                                     PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
4862                     ssc_clear_locale(data->start_class);
4863                     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4864                 }
4865                 else if (flags & SCF_DO_STCLASS_OR) {
4866                     ssc_union(data->start_class,
4867                               PL_XPosix_ptrs[_CC_VERTSPACE],
4868                               FALSE);
4869                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4870
4871                     /* See commit msg for
4872                      * 749e076fceedeb708a624933726e7989f2302f6a */
4873                     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4874                 }
4875                 flags &= ~SCF_DO_STCLASS;
4876             }
4877             min++;
4878             delta++;    /* Because of the 2 char string cr-lf */
4879             if (flags & SCF_DO_SUBSTR) {
4880                 /* Cannot expect anything... */
4881                 scan_commit(pRExC_state, data, minlenp, is_inf);
4882                 data->pos_min += 1;
4883                 data->pos_delta += 1;
4884                 data->longest = &(data->longest_float);
4885             }
4886         }
4887         else if (REGNODE_SIMPLE(OP(scan))) {
4888
4889             if (flags & SCF_DO_SUBSTR) {
4890                 scan_commit(pRExC_state, data, minlenp, is_inf);
4891                 data->pos_min++;
4892             }
4893             min++;
4894             if (flags & SCF_DO_STCLASS) {
4895                 bool invert = 0;
4896                 SV* my_invlist = sv_2mortal(_new_invlist(0));
4897                 U8 namedclass;
4898
4899                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4900                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4901
4902                 /* Some of the logic below assumes that switching
4903                    locale on will only add false positives. */
4904                 switch (OP(scan)) {
4905
4906                 default:
4907 #ifdef DEBUGGING
4908                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
4909                                                                      OP(scan));
4910 #endif
4911                 case CANY:
4912                 case SANY:
4913                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4914                         ssc_match_all_cp(data->start_class);
4915                     break;
4916
4917                 case REG_ANY:
4918                     {
4919                         SV* REG_ANY_invlist = _new_invlist(2);
4920                         REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
4921                                                             '\n');
4922                         if (flags & SCF_DO_STCLASS_OR) {
4923                             ssc_union(data->start_class,
4924                                       REG_ANY_invlist,
4925                                       TRUE /* TRUE => invert, hence all but \n
4926                                             */
4927                                       );
4928                         }
4929                         else if (flags & SCF_DO_STCLASS_AND) {
4930                             ssc_intersection(data->start_class,
4931                                              REG_ANY_invlist,
4932                                              TRUE  /* TRUE => invert */
4933                                              );
4934                             ssc_clear_locale(data->start_class);
4935                         }
4936                         SvREFCNT_dec_NN(REG_ANY_invlist);
4937                     }
4938                     break;
4939
4940                 case ANYOF:
4941                     if (flags & SCF_DO_STCLASS_AND)
4942                         ssc_and(pRExC_state, data->start_class,
4943                                 (regnode_charclass *) scan);
4944                     else
4945                         ssc_or(pRExC_state, data->start_class,
4946                                                           (regnode_charclass *) scan);
4947                     break;
4948
4949                 case NPOSIXL:
4950                     invert = 1;
4951                     /* FALLTHROUGH */
4952
4953                 case POSIXL:
4954                     namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
4955                     if (flags & SCF_DO_STCLASS_AND) {
4956                         bool was_there = cBOOL(
4957                                           ANYOF_POSIXL_TEST(data->start_class,
4958                                                                  namedclass));
4959                         ANYOF_POSIXL_ZERO(data->start_class);
4960                         if (was_there) {    /* Do an AND */
4961                             ANYOF_POSIXL_SET(data->start_class, namedclass);
4962                         }
4963                         /* No individual code points can now match */
4964                         data->start_class->invlist
4965                                                 = sv_2mortal(_new_invlist(0));
4966                     }
4967                     else {
4968                         int complement = namedclass + ((invert) ? -1 : 1);
4969
4970                         assert(flags & SCF_DO_STCLASS_OR);
4971
4972                         /* If the complement of this class was already there,
4973                          * the result is that they match all code points,
4974                          * (\d + \D == everything).  Remove the classes from
4975                          * future consideration.  Locale is not relevant in
4976                          * this case */
4977                         if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
4978                             ssc_match_all_cp(data->start_class);
4979                             ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
4980                             ANYOF_POSIXL_CLEAR(data->start_class, complement);
4981                         }
4982                         else {  /* The usual case; just add this class to the
4983                                    existing set */
4984                             ANYOF_POSIXL_SET(data->start_class, namedclass);
4985                         }
4986                     }
4987                     break;
4988
4989                 case NPOSIXA:   /* For these, we always know the exact set of
4990                                    what's matched */
4991                     invert = 1;
4992                     /* FALLTHROUGH */
4993                 case POSIXA:
4994                     if (FLAGS(scan) == _CC_ASCII) {
4995                         my_invlist = PL_XPosix_ptrs[_CC_ASCII];
4996                     }
4997                     else {
4998                         _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
4999                                               PL_XPosix_ptrs[_CC_ASCII],
5000                                               &my_invlist);
5001                     }
5002                     goto join_posix;
5003
5004                 case NPOSIXD:
5005                 case NPOSIXU:
5006                     invert = 1;
5007                     /* FALLTHROUGH */
5008                 case POSIXD:
5009                 case POSIXU:
5010                     my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
5011
5012                     /* NPOSIXD matches all upper Latin1 code points unless the
5013                      * target string being matched is UTF-8, which is
5014                      * unknowable until match time.  Since we are going to
5015                      * invert, we want to get rid of all of them so that the
5016                      * inversion will match all */
5017                     if (OP(scan) == NPOSIXD) {
5018                         _invlist_subtract(my_invlist, PL_UpperLatin1,
5019                                           &my_invlist);
5020                     }
5021
5022                   join_posix:
5023
5024                     if (flags & SCF_DO_STCLASS_AND) {
5025                         ssc_intersection(data->start_class, my_invlist, invert);
5026                         ssc_clear_locale(data->start_class);
5027                     }
5028                     else {
5029                         assert(flags & SCF_DO_STCLASS_OR);
5030                         ssc_union(data->start_class, my_invlist, invert);
5031                     }
5032                 }
5033                 if (flags & SCF_DO_STCLASS_OR)
5034                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5035                 flags &= ~SCF_DO_STCLASS;
5036             }
5037         }
5038         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5039             data->flags |= (OP(scan) == MEOL
5040                             ? SF_BEFORE_MEOL
5041                             : SF_BEFORE_SEOL);
5042             scan_commit(pRExC_state, data, minlenp, is_inf);
5043
5044         }
5045         else if (  PL_regkind[OP(scan)] == BRANCHJ
5046                  /* Lookbehind, or need to calculate parens/evals/stclass: */
5047                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
5048                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
5049             if ( OP(scan) == UNLESSM &&
5050                  scan->flags == 0 &&
5051                  OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
5052                  OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
5053             ) {
5054                 regnode *opt;
5055                 regnode *upto= regnext(scan);
5056                 DEBUG_PARSE_r({
5057                     SV * const mysv_val=sv_newmortal();
5058                     DEBUG_STUDYDATA("OPFAIL",data,depth);
5059
5060                     /*DEBUG_PARSE_MSG("opfail");*/
5061                     regprop(RExC_rx, mysv_val, upto, NULL);
5062                     PerlIO_printf(Perl_debug_log,
5063                         "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
5064                         SvPV_nolen_const(mysv_val),
5065                         (IV)REG_NODE_NUM(upto),
5066                         (IV)(upto - scan)
5067                     );
5068                 });
5069                 OP(scan) = OPFAIL;
5070                 NEXT_OFF(scan) = upto - scan;
5071                 for (opt= scan + 1; opt < upto ; opt++)
5072                     OP(opt) = OPTIMIZED;
5073                 scan= upto;
5074                 continue;
5075             }
5076             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5077                 || OP(scan) == UNLESSM )
5078             {
5079                 /* Negative Lookahead/lookbehind
5080                    In this case we can't do fixed string optimisation.
5081                 */
5082
5083                 SSize_t deltanext, minnext, fake = 0;
5084                 regnode *nscan;
5085                 regnode_ssc intrnl;
5086                 int f = 0;
5087
5088                 data_fake.flags = 0;
5089                 if (data) {
5090                     data_fake.whilem_c = data->whilem_c;
5091                     data_fake.last_closep = data->last_closep;
5092                 }
5093                 else
5094                     data_fake.last_closep = &fake;
5095                 data_fake.pos_delta = delta;
5096                 if ( flags & SCF_DO_STCLASS && !scan->flags
5097                      && OP(scan) == IFMATCH ) { /* Lookahead */
5098                     ssc_init(pRExC_state, &intrnl);
5099                     data_fake.start_class = &intrnl;
5100                     f |= SCF_DO_STCLASS_AND;
5101                 }
5102                 if (flags & SCF_WHILEM_VISITED_POS)
5103                     f |= SCF_WHILEM_VISITED_POS;
5104                 next = regnext(scan);
5105                 nscan = NEXTOPER(NEXTOPER(scan));
5106                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5107                                       last, &data_fake, stopparen,
5108                                       recursed_depth, NULL, f, depth+1);
5109                 if (scan->flags) {
5110                     if (deltanext) {
5111                         FAIL("Variable length lookbehind not implemented");
5112                     }
5113                     else if (minnext > (I32)U8_MAX) {
5114                         FAIL2("Lookbehind longer than %"UVuf" not implemented",
5115                               (UV)U8_MAX);
5116                     }
5117                     scan->flags = (U8)minnext;
5118                 }
5119                 if (data) {
5120                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5121                         pars++;
5122                     if (data_fake.flags & SF_HAS_EVAL)
5123                         data->flags |= SF_HAS_EVAL;
5124                     data->whilem_c = data_fake.whilem_c;
5125                 }
5126                 if (f & SCF_DO_STCLASS_AND) {
5127                     if (flags & SCF_DO_STCLASS_OR) {
5128                         /* OR before, AND after: ideally we would recurse with
5129                          * data_fake to get the AND applied by study of the
5130                          * remainder of the pattern, and then derecurse;
5131                          * *** HACK *** for now just treat as "no information".
5132                          * See [perl #56690].
5133                          */
5134                         ssc_init(pRExC_state, data->start_class);
5135                     }  else {
5136                         /* AND before and after: combine and continue */
5137                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5138                     }
5139                 }
5140             }
5141 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5142             else {
5143                 /* Positive Lookahead/lookbehind
5144                    In this case we can do fixed string optimisation,
5145                    but we must be careful about it. Note in the case of
5146                    lookbehind the positions will be offset by the minimum
5147                    length of the pattern, something we won't know about
5148                    until after the recurse.
5149                 */
5150                 SSize_t deltanext, fake = 0;
5151                 regnode *nscan;
5152                 regnode_ssc intrnl;
5153                 int f = 0;
5154                 /* We use SAVEFREEPV so that when the full compile
5155                     is finished perl will clean up the allocated
5156                     minlens when it's all done. This way we don't
5157                     have to worry about freeing them when we know
5158                     they wont be used, which would be a pain.
5159                  */
5160                 SSize_t *minnextp;
5161                 Newx( minnextp, 1, SSize_t );
5162                 SAVEFREEPV(minnextp);
5163
5164                 if (data) {
5165                     StructCopy(data, &data_fake, scan_data_t);
5166                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5167                         f |= SCF_DO_SUBSTR;
5168                         if (scan->flags)
5169                             scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5170                         data_fake.last_found=newSVsv(data->last_found);
5171                     }
5172                 }
5173                 else
5174                     data_fake.last_closep = &fake;
5175                 data_fake.flags = 0;
5176                 data_fake.pos_delta = delta;
5177                 if (is_inf)
5178                     data_fake.flags |= SF_IS_INF;
5179                 if ( flags & SCF_DO_STCLASS && !scan->flags
5180                      && OP(scan) == IFMATCH ) { /* Lookahead */
5181                     ssc_init(pRExC_state, &intrnl);
5182                     data_fake.start_class = &intrnl;
5183                     f |= SCF_DO_STCLASS_AND;
5184                 }
5185                 if (flags & SCF_WHILEM_VISITED_POS)
5186                     f |= SCF_WHILEM_VISITED_POS;
5187                 next = regnext(scan);
5188                 nscan = NEXTOPER(NEXTOPER(scan));
5189
5190                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5191                                         &deltanext, last, &data_fake,
5192                                         stopparen, recursed_depth, NULL,
5193                                         f,depth+1);
5194                 if (scan->flags) {
5195                     if (deltanext) {
5196                         FAIL("Variable length lookbehind not implemented");
5197                     }
5198                     else if (*minnextp > (I32)U8_MAX) {
5199                         FAIL2("Lookbehind longer than %"UVuf" not implemented",
5200                               (UV)U8_MAX);
5201                     }
5202                     scan->flags = (U8)*minnextp;
5203                 }
5204
5205                 *minnextp += min;
5206
5207                 if (f & SCF_DO_STCLASS_AND) {
5208                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5209                 }
5210                 if (data) {
5211                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5212                         pars++;
5213                     if (data_fake.flags & SF_HAS_EVAL)
5214                         data->flags |= SF_HAS_EVAL;
5215                     data->whilem_c = data_fake.whilem_c;
5216                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5217                         if (RExC_rx->minlen<*minnextp)
5218                             RExC_rx->minlen=*minnextp;
5219                         scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5220                         SvREFCNT_dec_NN(data_fake.last_found);
5221
5222                         if ( data_fake.minlen_fixed != minlenp )
5223                         {
5224                             data->offset_fixed= data_fake.offset_fixed;
5225                             data->minlen_fixed= data_fake.minlen_fixed;
5226                             data->lookbehind_fixed+= scan->flags;
5227                         }
5228                         if ( data_fake.minlen_float != minlenp )
5229                         {
5230                             data->minlen_float= data_fake.minlen_float;
5231                             data->offset_float_min=data_fake.offset_float_min;
5232                             data->offset_float_max=data_fake.offset_float_max;
5233                             data->lookbehind_float+= scan->flags;
5234                         }
5235                     }
5236                 }
5237             }
5238 #endif
5239         }
5240         else if (OP(scan) == OPEN) {
5241             if (stopparen != (I32)ARG(scan))
5242                 pars++;
5243         }
5244         else if (OP(scan) == CLOSE) {
5245             if (stopparen == (I32)ARG(scan)) {
5246                 break;
5247             }
5248             if ((I32)ARG(scan) == is_par) {
5249                 next = regnext(scan);
5250
5251                 if ( next && (OP(next) != WHILEM) && next < last)
5252                     is_par = 0;         /* Disable optimization */
5253             }
5254             if (data)
5255                 *(data->last_closep) = ARG(scan);
5256         }
5257         else if (OP(scan) == EVAL) {
5258                 if (data)
5259                     data->flags |= SF_HAS_EVAL;
5260         }
5261         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5262             if (flags & SCF_DO_SUBSTR) {
5263                 scan_commit(pRExC_state, data, minlenp, is_inf);
5264                 flags &= ~SCF_DO_SUBSTR;
5265             }
5266             if (data && OP(scan)==ACCEPT) {
5267                 data->flags |= SCF_SEEN_ACCEPT;
5268                 if (stopmin > min)
5269                     stopmin = min;
5270             }
5271         }
5272         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5273         {
5274                 if (flags & SCF_DO_SUBSTR) {
5275                     scan_commit(pRExC_state, data, minlenp, is_inf);
5276                     data->longest = &(data->longest_float);
5277                 }
5278                 is_inf = is_inf_internal = 1;
5279                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5280                     ssc_anything(data->start_class);
5281                 flags &= ~SCF_DO_STCLASS;
5282         }
5283         else if (OP(scan) == GPOS) {
5284             if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5285                 !(delta || is_inf || (data && data->pos_delta)))
5286             {
5287                 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5288                     RExC_rx->intflags |= PREGf_ANCH_GPOS;
5289                 if (RExC_rx->gofs < (STRLEN)min)
5290                     RExC_rx->gofs = min;
5291             } else {
5292                 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5293                 RExC_rx->gofs = 0;
5294             }
5295         }
5296 #ifdef TRIE_STUDY_OPT
5297 #ifdef FULL_TRIE_STUDY
5298         else if (PL_regkind[OP(scan)] == TRIE) {
5299             /* NOTE - There is similar code to this block above for handling
5300                BRANCH nodes on the initial study.  If you change stuff here
5301                check there too. */
5302             regnode *trie_node= scan;
5303             regnode *tail= regnext(scan);
5304             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5305             SSize_t max1 = 0, min1 = SSize_t_MAX;
5306             regnode_ssc accum;
5307
5308             if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5309                 /* Cannot merge strings after this. */
5310                 scan_commit(pRExC_state, data, minlenp, is_inf);
5311             }
5312             if (flags & SCF_DO_STCLASS)
5313                 ssc_init_zero(pRExC_state, &accum);
5314
5315             if (!trie->jump) {
5316                 min1= trie->minlen;
5317                 max1= trie->maxlen;
5318             } else {
5319                 const regnode *nextbranch= NULL;
5320                 U32 word;
5321
5322                 for ( word=1 ; word <= trie->wordcount ; word++)
5323                 {
5324                     SSize_t deltanext=0, minnext=0, f = 0, fake;
5325                     regnode_ssc this_class;
5326
5327                     data_fake.flags = 0;
5328                     if (data) {
5329                         data_fake.whilem_c = data->whilem_c;
5330                         data_fake.last_closep = data->last_closep;
5331                     }
5332                     else
5333                         data_fake.last_closep = &fake;
5334                     data_fake.pos_delta = delta;
5335                     if (flags & SCF_DO_STCLASS) {
5336                         ssc_init(pRExC_state, &this_class);
5337                         data_fake.start_class = &this_class;
5338                         f = SCF_DO_STCLASS_AND;
5339                     }
5340                     if (flags & SCF_WHILEM_VISITED_POS)
5341                         f |= SCF_WHILEM_VISITED_POS;
5342
5343                     if (trie->jump[word]) {
5344                         if (!nextbranch)
5345                             nextbranch = trie_node + trie->jump[0];
5346                         scan= trie_node + trie->jump[word];
5347                         /* We go from the jump point to the branch that follows
5348                            it. Note this means we need the vestigal unused
5349                            branches even though they arent otherwise used. */
5350                         minnext = study_chunk(pRExC_state, &scan, minlenp,
5351                             &deltanext, (regnode *)nextbranch, &data_fake,
5352                             stopparen, recursed_depth, NULL, f,depth+1);
5353                     }
5354                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5355                         nextbranch= regnext((regnode*)nextbranch);
5356
5357                     if (min1 > (SSize_t)(minnext + trie->minlen))
5358                         min1 = minnext + trie->minlen;
5359                     if (deltanext == SSize_t_MAX) {
5360                         is_inf = is_inf_internal = 1;
5361                         max1 = SSize_t_MAX;
5362                     } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5363                         max1 = minnext + deltanext + trie->maxlen;
5364
5365                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5366                         pars++;
5367                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
5368                         if ( stopmin > min + min1)
5369                             stopmin = min + min1;
5370                         flags &= ~SCF_DO_SUBSTR;
5371                         if (data)
5372                             data->flags |= SCF_SEEN_ACCEPT;
5373                     }
5374                     if (data) {
5375                         if (data_fake.flags & SF_HAS_EVAL)
5376                             data->flags |= SF_HAS_EVAL;
5377                         data->whilem_c = data_fake.whilem_c;
5378                     }
5379                     if (flags & SCF_DO_STCLASS)
5380                         ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5381                 }
5382             }
5383             if (flags & SCF_DO_SUBSTR) {
5384                 data->pos_min += min1;
5385                 data->pos_delta += max1 - min1;
5386                 if (max1 != min1 || is_inf)
5387                     data->longest = &(data->longest_float);
5388             }
5389             min += min1;
5390             delta += max1 - min1;
5391             if (flags & SCF_DO_STCLASS_OR) {
5392                 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5393                 if (min1) {
5394                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5395                     flags &= ~SCF_DO_STCLASS;
5396                 }
5397             }
5398             else if (flags & SCF_DO_STCLASS_AND) {
5399                 if (min1) {
5400                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5401                     flags &= ~SCF_DO_STCLASS;
5402                 }
5403                 else {
5404                     /* Switch to OR mode: cache the old value of
5405                      * data->start_class */
5406                     INIT_AND_WITHP;
5407                     StructCopy(data->start_class, and_withp, regnode_ssc);
5408                     flags &= ~SCF_DO_STCLASS_AND;
5409                     StructCopy(&accum, data->start_class, regnode_ssc);
5410                     flags |= SCF_DO_STCLASS_OR;
5411                 }
5412             }
5413             scan= tail;
5414             continue;
5415         }
5416 #else
5417         else if (PL_regkind[OP(scan)] == TRIE) {
5418             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5419             U8*bang=NULL;
5420
5421             min += trie->minlen;
5422             delta += (trie->maxlen - trie->minlen);
5423             flags &= ~SCF_DO_STCLASS; /* xxx */
5424             if (flags & SCF_DO_SUBSTR) {
5425                 /* Cannot expect anything... */
5426                 scan_commit(pRExC_state, data, minlenp, is_inf);
5427                 data->pos_min += trie->minlen;
5428                 data->pos_delta += (trie->maxlen - trie->minlen);
5429                 if (trie->maxlen != trie->minlen)
5430                     data->longest = &(data->longest_float);
5431             }
5432             if (trie->jump) /* no more substrings -- for now /grr*/
5433                flags &= ~SCF_DO_SUBSTR;
5434         }
5435 #endif /* old or new */
5436 #endif /* TRIE_STUDY_OPT */
5437
5438         /* Else: zero-length, ignore. */
5439         scan = regnext(scan);
5440     }
5441     /* If we are exiting a recursion we can unset its recursed bit
5442      * and allow ourselves to enter it again - no danger of an
5443      * infinite loop there.
5444     if (stopparen > -1 && recursed) {
5445         DEBUG_STUDYDATA("unset:", data,depth);
5446         PAREN_UNSET( recursed, stopparen);
5447     }
5448     */
5449     if (frame) {
5450         DEBUG_STUDYDATA("frame-end:",data,depth);
5451         DEBUG_PEEP("fend", scan, depth);
5452         /* restore previous context */
5453         last = frame->last;
5454         scan = frame->next;
5455         stopparen = frame->stop;
5456         recursed_depth = frame->prev_recursed_depth;
5457         depth = depth - 1;
5458
5459         frame = frame->prev;
5460         goto fake_study_recurse;
5461     }
5462
5463   finish:
5464     assert(!frame);
5465     DEBUG_STUDYDATA("pre-fin:",data,depth);
5466
5467     *scanp = scan;
5468     *deltap = is_inf_internal ? SSize_t_MAX : delta;
5469
5470     if (flags & SCF_DO_SUBSTR && is_inf)
5471         data->pos_delta = SSize_t_MAX - data->pos_min;
5472     if (is_par > (I32)U8_MAX)
5473         is_par = 0;
5474     if (is_par && pars==1 && data) {
5475         data->flags |= SF_IN_PAR;
5476         data->flags &= ~SF_HAS_PAR;
5477     }
5478     else if (pars && data) {
5479         data->flags |= SF_HAS_PAR;
5480         data->flags &= ~SF_IN_PAR;
5481     }
5482     if (flags & SCF_DO_STCLASS_OR)
5483         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5484     if (flags & SCF_TRIE_RESTUDY)
5485         data->flags |=  SCF_TRIE_RESTUDY;
5486
5487     DEBUG_STUDYDATA("post-fin:",data,depth);
5488
5489     {
5490         SSize_t final_minlen= min < stopmin ? min : stopmin;
5491
5492         if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) {
5493             RExC_maxlen = final_minlen + delta;
5494         }
5495         return final_minlen;
5496     }
5497     /* not-reached */
5498 }
5499
5500 STATIC U32
5501 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5502 {
5503     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5504
5505     PERL_ARGS_ASSERT_ADD_DATA;
5506
5507     Renewc(RExC_rxi->data,
5508            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5509            char, struct reg_data);
5510     if(count)
5511         Renew(RExC_rxi->data->what, count + n, U8);
5512     else
5513         Newx(RExC_rxi->data->what, n, U8);
5514     RExC_rxi->data->count = count + n;
5515     Copy(s, RExC_rxi->data->what + count, n, U8);
5516     return count;
5517 }
5518
5519 /*XXX: todo make this not included in a non debugging perl, but appears to be
5520  * used anyway there, in 'use re' */
5521 #ifndef PERL_IN_XSUB_RE
5522 void
5523 Perl_reginitcolors(pTHX)
5524 {
5525     dVAR;
5526     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5527     if (s) {
5528         char *t = savepv(s);
5529         int i = 0;
5530         PL_colors[0] = t;
5531         while (++i < 6) {
5532             t = strchr(t, '\t');
5533             if (t) {
5534                 *t = '\0';
5535                 PL_colors[i] = ++t;
5536             }
5537             else
5538                 PL_colors[i] = t = (char *)"";
5539         }
5540     } else {
5541         int i = 0;
5542         while (i < 6)
5543             PL_colors[i++] = (char *)"";
5544     }
5545     PL_colorset = 1;
5546 }
5547 #endif
5548
5549
5550 #ifdef TRIE_STUDY_OPT
5551 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
5552     STMT_START {                                            \
5553         if (                                                \
5554               (data.flags & SCF_TRIE_RESTUDY)               \
5555               && ! restudied++                              \
5556         ) {                                                 \
5557             dOsomething;                                    \
5558             goto reStudy;                                   \
5559         }                                                   \
5560     } STMT_END
5561 #else
5562 #define CHECK_RESTUDY_GOTO_butfirst
5563 #endif
5564
5565 /*
5566  * pregcomp - compile a regular expression into internal code
5567  *
5568  * Decides which engine's compiler to call based on the hint currently in
5569  * scope
5570  */
5571
5572 #ifndef PERL_IN_XSUB_RE
5573
5574 /* return the currently in-scope regex engine (or the default if none)  */
5575
5576 regexp_engine const *
5577 Perl_current_re_engine(pTHX)
5578 {
5579     dVAR;
5580
5581     if (IN_PERL_COMPILETIME) {
5582         HV * const table = GvHV(PL_hintgv);
5583         SV **ptr;
5584
5585         if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5586             return &PL_core_reg_engine;
5587         ptr = hv_fetchs(table, "regcomp", FALSE);
5588         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5589             return &PL_core_reg_engine;
5590         return INT2PTR(regexp_engine*,SvIV(*ptr));
5591     }
5592     else {
5593         SV *ptr;
5594         if (!PL_curcop->cop_hints_hash)
5595             return &PL_core_reg_engine;
5596         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5597         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5598             return &PL_core_reg_engine;
5599         return INT2PTR(regexp_engine*,SvIV(ptr));
5600     }
5601 }
5602
5603
5604 REGEXP *
5605 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5606 {
5607     dVAR;
5608     regexp_engine const *eng = current_re_engine();
5609     GET_RE_DEBUG_FLAGS_DECL;
5610
5611     PERL_ARGS_ASSERT_PREGCOMP;
5612
5613     /* Dispatch a request to compile a regexp to correct regexp engine. */
5614     DEBUG_COMPILE_r({
5615         PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5616                         PTR2UV(eng));
5617     });
5618     return CALLREGCOMP_ENG(eng, pattern, flags);
5619 }
5620 #endif
5621
5622 /* public(ish) entry point for the perl core's own regex compiling code.
5623  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5624  * pattern rather than a list of OPs, and uses the internal engine rather
5625  * than the current one */
5626
5627 REGEXP *
5628 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5629 {
5630     SV *pat = pattern; /* defeat constness! */
5631     PERL_ARGS_ASSERT_RE_COMPILE;
5632     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5633 #ifdef PERL_IN_XSUB_RE
5634                                 &my_reg_engine,
5635 #else
5636                                 &PL_core_reg_engine,
5637 #endif
5638                                 NULL, NULL, rx_flags, 0);
5639 }
5640
5641
5642 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5643  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5644  * point to the realloced string and length.
5645  *
5646  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5647  * stuff added */
5648
5649 static void
5650 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5651                     char **pat_p, STRLEN *plen_p, int num_code_blocks)
5652 {
5653     U8 *const src = (U8*)*pat_p;
5654     U8 *dst;
5655     int n=0;
5656     STRLEN s = 0, d = 0;
5657     bool do_end = 0;
5658     GET_RE_DEBUG_FLAGS_DECL;
5659
5660     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5661         "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5662
5663     Newx(dst, *plen_p * 2 + 1, U8);
5664
5665     while (s < *plen_p) {
5666         if (NATIVE_BYTE_IS_INVARIANT(src[s]))
5667             dst[d]   = src[s];
5668         else {
5669             dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
5670             dst[d]   = UTF8_EIGHT_BIT_LO(src[s]);
5671         }
5672         if (n < num_code_blocks) {
5673             if (!do_end && pRExC_state->code_blocks[n].start == s) {
5674                 pRExC_state->code_blocks[n].start = d;
5675                 assert(dst[d] == '(');
5676                 do_end = 1;
5677             }
5678             else if (do_end && pRExC_state->code_blocks[n].end == s) {
5679                 pRExC_state->code_blocks[n].end = d;
5680                 assert(dst[d] == ')');
5681                 do_end = 0;
5682                 n++;
5683             }
5684         }
5685         s++;
5686         d++;
5687     }
5688     dst[d] = '\0';
5689     *plen_p = d;
5690     *pat_p = (char*) dst;
5691     SAVEFREEPV(*pat_p);
5692     RExC_orig_utf8 = RExC_utf8 = 1;
5693 }
5694
5695
5696
5697 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5698  * while recording any code block indices, and handling overloading,
5699  * nested qr// objects etc.  If pat is null, it will allocate a new
5700  * string, or just return the first arg, if there's only one.
5701  *
5702  * Returns the malloced/updated pat.
5703  * patternp and pat_count is the array of SVs to be concatted;
5704  * oplist is the optional list of ops that generated the SVs;
5705  * recompile_p is a pointer to a boolean that will be set if
5706  *   the regex will need to be recompiled.
5707  * delim, if non-null is an SV that will be inserted between each element
5708  */
5709
5710 static SV*
5711 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5712                 SV *pat, SV ** const patternp, int pat_count,
5713                 OP *oplist, bool *recompile_p, SV *delim)
5714 {
5715     SV **svp;
5716     int n = 0;
5717     bool use_delim = FALSE;
5718     bool alloced = FALSE;
5719
5720     /* if we know we have at least two args, create an empty string,
5721      * then concatenate args to that. For no args, return an empty string */
5722     if (!pat && pat_count != 1) {
5723         pat = newSVpvn("", 0);
5724         SAVEFREESV(pat);
5725         alloced = TRUE;
5726     }
5727
5728     for (svp = patternp; svp < patternp + pat_count; svp++) {
5729         SV *sv;
5730         SV *rx  = NULL;
5731         STRLEN orig_patlen = 0;
5732         bool code = 0;
5733         SV *msv = use_delim ? delim : *svp;
5734         if (!msv) msv = &PL_sv_undef;
5735
5736         /* if we've got a delimiter, we go round the loop twice for each
5737          * svp slot (except the last), using the delimiter the second
5738          * time round */
5739         if (use_delim) {
5740             svp--;
5741             use_delim = FALSE;
5742         }
5743         else if (delim)
5744             use_delim = TRUE;
5745
5746         if (SvTYPE(msv) == SVt_PVAV) {
5747             /* we've encountered an interpolated array within
5748              * the pattern, e.g. /...@a..../. Expand the list of elements,
5749              * then recursively append elements.
5750              * The code in this block is based on S_pushav() */
5751
5752             AV *const av = (AV*)msv;
5753             const SSize_t maxarg = AvFILL(av) + 1;
5754             SV **array;
5755
5756             if (oplist) {
5757                 assert(oplist->op_type == OP_PADAV
5758                     || oplist->op_type == OP_RV2AV);
5759                 oplist = oplist->op_sibling;;
5760             }
5761
5762             if (SvRMAGICAL(av)) {
5763                 SSize_t i;
5764
5765                 Newx(array, maxarg, SV*);
5766                 SAVEFREEPV(array);
5767                 for (i=0; i < maxarg; i++) {
5768                     SV ** const svp = av_fetch(av, i, FALSE);
5769                     array[i] = svp ? *svp : &PL_sv_undef;
5770                 }
5771             }
5772             else
5773                 array = AvARRAY(av);
5774
5775             pat = S_concat_pat(aTHX_ pRExC_state, pat,
5776                                 array, maxarg, NULL, recompile_p,
5777                                 /* $" */
5778                                 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5779
5780             continue;
5781         }
5782
5783
5784         /* we make the assumption here that each op in the list of
5785          * op_siblings maps to one SV pushed onto the stack,
5786          * except for code blocks, with have both an OP_NULL and
5787          * and OP_CONST.
5788          * This allows us to match up the list of SVs against the
5789          * list of OPs to find the next code block.
5790          *
5791          * Note that       PUSHMARK PADSV PADSV ..
5792          * is optimised to
5793          *                 PADRANGE PADSV  PADSV  ..
5794          * so the alignment still works. */
5795
5796         if (oplist) {
5797             if (oplist->op_type == OP_NULL
5798                 && (oplist->op_flags & OPf_SPECIAL))
5799             {
5800                 assert(n < pRExC_state->num_code_blocks);
5801                 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5802                 pRExC_state->code_blocks[n].block = oplist;
5803                 pRExC_state->code_blocks[n].src_regex = NULL;
5804                 n++;
5805                 code = 1;
5806                 oplist = oplist->op_sibling; /* skip CONST */
5807                 assert(oplist);
5808             }
5809             oplist = oplist->op_sibling;;
5810         }
5811
5812         /* apply magic and QR overloading to arg */
5813
5814         SvGETMAGIC(msv);
5815         if (SvROK(msv) && SvAMAGIC(msv)) {
5816             SV *sv = AMG_CALLunary(msv, regexp_amg);
5817             if (sv) {
5818                 if (SvROK(sv))
5819                     sv = SvRV(sv);
5820                 if (SvTYPE(sv) != SVt_REGEXP)
5821                     Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5822                 msv = sv;
5823             }
5824         }
5825
5826         /* try concatenation overload ... */
5827         if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5828                 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5829         {
5830             sv_setsv(pat, sv);
5831             /* overloading involved: all bets are off over literal
5832              * code. Pretend we haven't seen it */
5833             pRExC_state->num_code_blocks -= n;
5834             n = 0;
5835         }
5836         else  {
5837             /* ... or failing that, try "" overload */
5838             while (SvAMAGIC(msv)
5839                     && (sv = AMG_CALLunary(msv, string_amg))
5840                     && sv != msv
5841                     &&  !(   SvROK(msv)
5842                           && SvROK(sv)
5843                           && SvRV(msv) == SvRV(sv))
5844             ) {
5845                 msv = sv;
5846                 SvGETMAGIC(msv);
5847             }
5848             if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5849                 msv = SvRV(msv);
5850
5851             if (pat) {
5852                 /* this is a partially unrolled
5853                  *     sv_catsv_nomg(pat, msv);
5854                  * that allows us to adjust code block indices if
5855                  * needed */
5856                 STRLEN dlen;
5857                 char *dst = SvPV_force_nomg(pat, dlen);
5858                 orig_patlen = dlen;
5859                 if (SvUTF8(msv) && !SvUTF8(pat)) {
5860                     S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5861                     sv_setpvn(pat, dst, dlen);
5862                     SvUTF8_on(pat);
5863                 }
5864                 sv_catsv_nomg(pat, msv);
5865                 rx = msv;
5866             }
5867             else
5868                 pat = msv;
5869
5870             if (code)
5871                 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5872         }
5873
5874         /* extract any code blocks within any embedded qr//'s */
5875         if (rx && SvTYPE(rx) == SVt_REGEXP
5876             && RX_ENGINE((REGEXP*)rx)->op_comp)
5877         {
5878
5879             RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5880             if (ri->num_code_blocks) {
5881                 int i;
5882                 /* the presence of an embedded qr// with code means
5883                  * we should always recompile: the text of the
5884                  * qr// may not have changed, but it may be a
5885                  * different closure than last time */
5886                 *recompile_p = 1;
5887                 Renew(pRExC_state->code_blocks,
5888                     pRExC_state->num_code_blocks + ri->num_code_blocks,
5889                     struct reg_code_block);
5890                 pRExC_state->num_code_blocks += ri->num_code_blocks;
5891
5892                 for (i=0; i < ri->num_code_blocks; i++) {
5893                     struct reg_code_block *src, *dst;
5894                     STRLEN offset =  orig_patlen
5895                         + ReANY((REGEXP *)rx)->pre_prefix;
5896                     assert(n < pRExC_state->num_code_blocks);
5897                     src = &ri->code_blocks[i];
5898                     dst = &pRExC_state->code_blocks[n];
5899                     dst->start      = src->start + offset;
5900                     dst->end        = src->end   + offset;
5901                     dst->block      = src->block;
5902                     dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
5903                                             src->src_regex
5904                                                 ? src->src_regex
5905                                                 : (REGEXP*)rx);
5906                     n++;
5907                 }
5908             }
5909         }
5910     }
5911     /* avoid calling magic multiple times on a single element e.g. =~ $qr */
5912     if (alloced)
5913         SvSETMAGIC(pat);
5914
5915     return pat;
5916 }
5917
5918
5919
5920 /* see if there are any run-time code blocks in the pattern.
5921  * False positives are allowed */
5922
5923 static bool
5924 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5925                     char *pat, STRLEN plen)
5926 {
5927     int n = 0;
5928     STRLEN s;
5929
5930     for (s = 0; s < plen; s++) {
5931         if (n < pRExC_state->num_code_blocks
5932             && s == pRExC_state->code_blocks[n].start)
5933         {
5934             s = pRExC_state->code_blocks[n].end;
5935             n++;
5936             continue;
5937         }
5938         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5939          * positives here */
5940         if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
5941             (pat[s+2] == '{'
5942                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
5943         )
5944             return 1;
5945     }
5946     return 0;
5947 }
5948
5949 /* Handle run-time code blocks. We will already have compiled any direct
5950  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5951  * copy of it, but with any literal code blocks blanked out and
5952  * appropriate chars escaped; then feed it into
5953  *
5954  *    eval "qr'modified_pattern'"
5955  *
5956  * For example,
5957  *
5958  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5959  *
5960  * becomes
5961  *
5962  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5963  *
5964  * After eval_sv()-ing that, grab any new code blocks from the returned qr
5965  * and merge them with any code blocks of the original regexp.
5966  *
5967  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5968  * instead, just save the qr and return FALSE; this tells our caller that
5969  * the original pattern needs upgrading to utf8.
5970  */
5971
5972 static bool
5973 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5974     char *pat, STRLEN plen)
5975 {
5976     SV *qr;
5977
5978     GET_RE_DEBUG_FLAGS_DECL;
5979
5980     if (pRExC_state->runtime_code_qr) {
5981         /* this is the second time we've been called; this should
5982          * only happen if the main pattern got upgraded to utf8
5983          * during compilation; re-use the qr we compiled first time
5984          * round (which should be utf8 too)
5985          */
5986         qr = pRExC_state->runtime_code_qr;
5987         pRExC_state->runtime_code_qr = NULL;
5988         assert(RExC_utf8 && SvUTF8(qr));
5989     }
5990     else {
5991         int n = 0;
5992         STRLEN s;
5993         char *p, *newpat;
5994         int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5995         SV *sv, *qr_ref;
5996         dSP;
5997
5998         /* determine how many extra chars we need for ' and \ escaping */
5999         for (s = 0; s < plen; s++) {
6000             if (pat[s] == '\'' || pat[s] == '\\')
6001                 newlen++;
6002         }
6003
6004         Newx(newpat, newlen, char);
6005         p = newpat;
6006         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
6007
6008         for (s = 0; s < plen; s++) {
6009             if (n < pRExC_state->num_code_blocks
6010                 && s == pRExC_state->code_blocks[n].start)
6011             {
6012                 /* blank out literal code block */
6013                 assert(pat[s] == '(');
6014                 while (s <= pRExC_state->code_blocks[n].end) {
6015                     *p++ = '_';
6016                     s++;
6017                 }
6018                 s--;
6019                 n++;
6020                 continue;
6021             }
6022             if (pat[s] == '\'' || pat[s] == '\\')
6023                 *p++ = '\\';
6024             *p++ = pat[s];
6025         }
6026         *p++ = '\'';
6027         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
6028             *p++ = 'x';
6029         *p++ = '\0';
6030         DEBUG_COMPILE_r({
6031             PerlIO_printf(Perl_debug_log,
6032                 "%sre-parsing pattern for runtime code:%s %s\n",
6033                 PL_colors[4],PL_colors[5],newpat);
6034         });
6035
6036         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
6037         Safefree(newpat);
6038
6039         ENTER;
6040         SAVETMPS;
6041         save_re_context();
6042         PUSHSTACKi(PERLSI_REQUIRE);
6043         /* G_RE_REPARSING causes the toker to collapse \\ into \ when
6044          * parsing qr''; normally only q'' does this. It also alters
6045          * hints handling */
6046         eval_sv(sv, G_SCALAR|G_RE_REPARSING);
6047         SvREFCNT_dec_NN(sv);
6048         SPAGAIN;
6049         qr_ref = POPs;
6050         PUTBACK;
6051         {
6052             SV * const errsv = ERRSV;
6053             if (SvTRUE_NN(errsv))
6054             {
6055                 Safefree(pRExC_state->code_blocks);
6056                 /* use croak_sv ? */
6057                 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
6058             }
6059         }
6060         assert(SvROK(qr_ref));
6061         qr = SvRV(qr_ref);
6062         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
6063         /* the leaving below frees the tmp qr_ref.
6064          * Give qr a life of its own */
6065         SvREFCNT_inc(qr);
6066         POPSTACK;
6067         FREETMPS;
6068         LEAVE;
6069
6070     }
6071
6072     if (!RExC_utf8 && SvUTF8(qr)) {
6073         /* first time through; the pattern got upgraded; save the
6074          * qr for the next time through */
6075         assert(!pRExC_state->runtime_code_qr);
6076         pRExC_state->runtime_code_qr = qr;
6077         return 0;
6078     }
6079
6080
6081     /* extract any code blocks within the returned qr//  */
6082
6083
6084     /* merge the main (r1) and run-time (r2) code blocks into one */
6085     {
6086         RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6087         struct reg_code_block *new_block, *dst;
6088         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6089         int i1 = 0, i2 = 0;
6090
6091         if (!r2->num_code_blocks) /* we guessed wrong */
6092         {
6093             SvREFCNT_dec_NN(qr);
6094             return 1;
6095         }
6096
6097         Newx(new_block,
6098             r1->num_code_blocks + r2->num_code_blocks,
6099             struct reg_code_block);
6100         dst = new_block;
6101
6102         while (    i1 < r1->num_code_blocks
6103                 || i2 < r2->num_code_blocks)
6104         {
6105             struct reg_code_block *src;
6106             bool is_qr = 0;
6107
6108             if (i1 == r1->num_code_blocks) {
6109                 src = &r2->code_blocks[i2++];
6110                 is_qr = 1;
6111             }
6112             else if (i2 == r2->num_code_blocks)
6113                 src = &r1->code_blocks[i1++];
6114             else if (  r1->code_blocks[i1].start
6115                      < r2->code_blocks[i2].start)
6116             {
6117                 src = &r1->code_blocks[i1++];
6118                 assert(src->end < r2->code_blocks[i2].start);
6119             }
6120             else {
6121                 assert(  r1->code_blocks[i1].start
6122                        > r2->code_blocks[i2].start);
6123                 src = &r2->code_blocks[i2++];
6124                 is_qr = 1;
6125                 assert(src->end < r1->code_blocks[i1].start);
6126             }
6127
6128             assert(pat[src->start] == '(');
6129             assert(pat[src->end]   == ')');
6130             dst->start      = src->start;
6131             dst->end        = src->end;
6132             dst->block      = src->block;
6133             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6134                                     : src->src_regex;
6135             dst++;
6136         }
6137         r1->num_code_blocks += r2->num_code_blocks;
6138         Safefree(r1->code_blocks);
6139         r1->code_blocks = new_block;
6140     }
6141
6142     SvREFCNT_dec_NN(qr);
6143     return 1;
6144 }
6145
6146
6147 STATIC bool
6148 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6149                       SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6150                       SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6151                       STRLEN longest_length, bool eol, bool meol)
6152 {
6153     /* This is the common code for setting up the floating and fixed length
6154      * string data extracted from Perl_re_op_compile() below.  Returns a boolean
6155      * as to whether succeeded or not */
6156
6157     I32 t;
6158     SSize_t ml;
6159
6160     if (! (longest_length
6161            || (eol /* Can't have SEOL and MULTI */
6162                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6163           )
6164             /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6165         || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6166     {
6167         return FALSE;
6168     }
6169
6170     /* copy the information about the longest from the reg_scan_data
6171         over to the program. */
6172     if (SvUTF8(sv_longest)) {
6173         *rx_utf8 = sv_longest;
6174         *rx_substr = NULL;
6175     } else {
6176         *rx_substr = sv_longest;
6177         *rx_utf8 = NULL;
6178     }
6179     /* end_shift is how many chars that must be matched that
6180         follow this item. We calculate it ahead of time as once the
6181         lookbehind offset is added in we lose the ability to correctly
6182         calculate it.*/
6183     ml = minlen ? *(minlen) : (SSize_t)longest_length;
6184     *rx_end_shift = ml - offset
6185         - longest_length + (SvTAIL(sv_longest) != 0)
6186         + lookbehind;
6187
6188     t = (eol/* Can't have SEOL and MULTI */
6189          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6190     fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6191
6192     return TRUE;
6193 }
6194
6195 /*
6196  * Perl_re_op_compile - the perl internal RE engine's function to compile a
6197  * regular expression into internal code.
6198  * The pattern may be passed either as:
6199  *    a list of SVs (patternp plus pat_count)
6200  *    a list of OPs (expr)
6201  * If both are passed, the SV list is used, but the OP list indicates
6202  * which SVs are actually pre-compiled code blocks
6203  *
6204  * The SVs in the list have magic and qr overloading applied to them (and
6205  * the list may be modified in-place with replacement SVs in the latter
6206  * case).
6207  *
6208  * If the pattern hasn't changed from old_re, then old_re will be
6209  * returned.
6210  *
6211  * eng is the current engine. If that engine has an op_comp method, then
6212  * handle directly (i.e. we assume that op_comp was us); otherwise, just
6213  * do the initial concatenation of arguments and pass on to the external
6214  * engine.
6215  *
6216  * If is_bare_re is not null, set it to a boolean indicating whether the
6217  * arg list reduced (after overloading) to a single bare regex which has
6218  * been returned (i.e. /$qr/).
6219  *
6220  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6221  *
6222  * pm_flags contains the PMf_* flags, typically based on those from the
6223  * pm_flags field of the related PMOP. Currently we're only interested in
6224  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6225  *
6226  * We can't allocate space until we know how big the compiled form will be,
6227  * but we can't compile it (and thus know how big it is) until we've got a
6228  * place to put the code.  So we cheat:  we compile it twice, once with code
6229  * generation turned off and size counting turned on, and once "for real".
6230  * This also means that we don't allocate space until we are sure that the
6231  * thing really will compile successfully, and we never have to move the
6232  * code and thus invalidate pointers into it.  (Note that it has to be in
6233  * one piece because free() must be able to free it all.) [NB: not true in perl]
6234  *
6235  * Beware that the optimization-preparation code in here knows about some
6236  * of the structure of the compiled regexp.  [I'll say.]
6237  */
6238
6239 REGEXP *
6240 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6241                     OP *expr, const regexp_engine* eng, REGEXP *old_re,
6242                      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6243 {
6244     dVAR;
6245     REGEXP *rx;
6246     struct regexp *r;
6247     regexp_internal *ri;
6248     STRLEN plen;
6249     char *exp;
6250     regnode *scan;
6251     I32 flags;
6252     SSize_t minlen = 0;
6253     U32 rx_flags;
6254     SV *pat;
6255     SV *code_blocksv = NULL;
6256     SV** new_patternp = patternp;
6257
6258     /* these are all flags - maybe they should be turned
6259      * into a single int with different bit masks */
6260     I32 sawlookahead = 0;
6261     I32 sawplus = 0;
6262     I32 sawopen = 0;
6263     I32 sawminmod = 0;
6264
6265     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6266     bool recompile = 0;
6267     bool runtime_code = 0;
6268     scan_data_t data;
6269     RExC_state_t RExC_state;
6270     RExC_state_t * const pRExC_state = &RExC_state;
6271 #ifdef TRIE_STUDY_OPT
6272     int restudied = 0;
6273     RExC_state_t copyRExC_state;
6274 #endif
6275     GET_RE_DEBUG_FLAGS_DECL;
6276
6277     PERL_ARGS_ASSERT_RE_OP_COMPILE;
6278
6279     DEBUG_r(if (!PL_colorset) reginitcolors());
6280
6281 #ifndef PERL_IN_XSUB_RE
6282     /* Initialize these here instead of as-needed, as is quick and avoids
6283      * having to test them each time otherwise */
6284     if (! PL_AboveLatin1) {
6285         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6286         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6287         PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6288         PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6289         PL_HasMultiCharFold =
6290                        _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6291     }
6292 #endif
6293
6294     pRExC_state->code_blocks = NULL;
6295     pRExC_state->num_code_blocks = 0;
6296
6297     if (is_bare_re)
6298         *is_bare_re = FALSE;
6299
6300     if (expr && (expr->op_type == OP_LIST ||
6301                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6302         /* allocate code_blocks if needed */
6303         OP *o;
6304         int ncode = 0;
6305
6306         for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
6307             if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6308                 ncode++; /* count of DO blocks */
6309         if (ncode) {
6310             pRExC_state->num_code_blocks = ncode;
6311             Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6312         }
6313     }
6314
6315     if (!pat_count) {
6316         /* compile-time pattern with just OP_CONSTs and DO blocks */
6317
6318         int n;
6319         OP *o;
6320
6321         /* find how many CONSTs there are */
6322         assert(expr);
6323         n = 0;
6324         if (expr->op_type == OP_CONST)
6325             n = 1;
6326         else
6327             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6328                 if (o->op_type == OP_CONST)
6329                     n++;
6330             }
6331
6332         /* fake up an SV array */
6333
6334         assert(!new_patternp);
6335         Newx(new_patternp, n, SV*);
6336         SAVEFREEPV(new_patternp);
6337         pat_count = n;
6338
6339         n = 0;
6340         if (expr->op_type == OP_CONST)
6341             new_patternp[n] = cSVOPx_sv(expr);
6342         else
6343             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6344                 if (o->op_type == OP_CONST)
6345                     new_patternp[n++] = cSVOPo_sv;
6346             }
6347
6348     }
6349
6350     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6351         "Assembling pattern from %d elements%s\n", pat_count,
6352             orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6353
6354     /* set expr to the first arg op */
6355
6356     if (pRExC_state->num_code_blocks
6357          && expr->op_type != OP_CONST)
6358     {
6359             expr = cLISTOPx(expr)->op_first;
6360             assert(   expr->op_type == OP_PUSHMARK
6361                    || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6362                    || expr->op_type == OP_PADRANGE);
6363             expr = expr->op_sibling;
6364     }
6365
6366     pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6367                         expr, &recompile, NULL);
6368
6369     /* handle bare (possibly after overloading) regex: foo =~ $re */
6370     {
6371         SV *re = pat;
6372         if (SvROK(re))
6373             re = SvRV(re);
6374         if (SvTYPE(re) == SVt_REGEXP) {
6375             if (is_bare_re)
6376                 *is_bare_re = TRUE;
6377             SvREFCNT_inc(re);
6378             Safefree(pRExC_state->code_blocks);
6379             DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6380                 "Precompiled pattern%s\n",
6381                     orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6382
6383             return (REGEXP*)re;
6384         }
6385     }
6386
6387     exp = SvPV_nomg(pat, plen);
6388
6389     if (!eng->op_comp) {
6390         if ((SvUTF8(pat) && IN_BYTES)
6391                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6392         {
6393             /* make a temporary copy; either to convert to bytes,
6394              * or to avoid repeating get-magic / overloaded stringify */
6395             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6396                                         (IN_BYTES ? 0 : SvUTF8(pat)));
6397         }
6398         Safefree(pRExC_state->code_blocks);
6399         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6400     }
6401
6402     /* ignore the utf8ness if the pattern is 0 length */
6403     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6404     RExC_uni_semantics = 0;
6405     RExC_contains_locale = 0;
6406     RExC_contains_i = 0;
6407     pRExC_state->runtime_code_qr = NULL;
6408
6409     DEBUG_COMPILE_r({
6410             SV *dsv= sv_newmortal();
6411             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6412             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6413                           PL_colors[4],PL_colors[5],s);
6414         });
6415
6416   redo_first_pass:
6417     /* we jump here if we upgrade the pattern to utf8 and have to
6418      * recompile */
6419
6420     if ((pm_flags & PMf_USE_RE_EVAL)
6421                 /* this second condition covers the non-regex literal case,
6422                  * i.e.  $foo =~ '(?{})'. */
6423                 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6424     )
6425         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6426
6427     /* return old regex if pattern hasn't changed */
6428     /* XXX: note in the below we have to check the flags as well as the
6429      * pattern.
6430      *
6431      * Things get a touch tricky as we have to compare the utf8 flag
6432      * independently from the compile flags.  */
6433
6434     if (   old_re
6435         && !recompile
6436         && !!RX_UTF8(old_re) == !!RExC_utf8
6437         && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6438         && RX_PRECOMP(old_re)
6439         && RX_PRELEN(old_re) == plen
6440         && memEQ(RX_PRECOMP(old_re), exp, plen)
6441         && !runtime_code /* with runtime code, always recompile */ )
6442     {
6443         Safefree(pRExC_state->code_blocks);
6444         return old_re;
6445     }
6446
6447     rx_flags = orig_rx_flags;
6448
6449     if (rx_flags & PMf_FOLD) {
6450         RExC_contains_i = 1;
6451     }
6452     if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6453
6454         /* Set to use unicode semantics if the pattern is in utf8 and has the
6455          * 'depends' charset specified, as it means unicode when utf8  */
6456         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6457     }
6458
6459     RExC_precomp = exp;
6460     RExC_flags = rx_flags;
6461     RExC_pm_flags = pm_flags;
6462
6463     if (runtime_code) {
6464         if (TAINTING_get && TAINT_get)
6465             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6466
6467         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6468             /* whoops, we have a non-utf8 pattern, whilst run-time code
6469              * got compiled as utf8. Try again with a utf8 pattern */
6470             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6471                                     pRExC_state->num_code_blocks);
6472             goto redo_first_pass;
6473         }
6474     }
6475     assert(!pRExC_state->runtime_code_qr);
6476
6477     RExC_sawback = 0;
6478
6479     RExC_seen = 0;
6480     RExC_maxlen = 0;
6481     RExC_in_lookbehind = 0;
6482     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6483     RExC_extralen = 0;
6484     RExC_override_recoding = 0;
6485     RExC_in_multi_char_class = 0;
6486
6487     /* First pass: determine size, legality. */
6488     RExC_parse = exp;
6489     RExC_start = exp;
6490     RExC_end = exp + plen;
6491     RExC_naughty = 0;
6492     RExC_npar = 1;
6493     RExC_nestroot = 0;
6494     RExC_size = 0L;
6495     RExC_emit = (regnode *) &RExC_emit_dummy;
6496     RExC_whilem_seen = 0;
6497     RExC_open_parens = NULL;
6498     RExC_close_parens = NULL;
6499     RExC_opend = NULL;
6500     RExC_paren_names = NULL;
6501 #ifdef DEBUGGING
6502     RExC_paren_name_list = NULL;
6503 #endif
6504     RExC_recurse = NULL;
6505     RExC_study_chunk_recursed = NULL;
6506     RExC_study_chunk_recursed_bytes= 0;
6507     RExC_recurse_count = 0;
6508     pRExC_state->code_index = 0;
6509
6510 #if 0 /* REGC() is (currently) a NOP at the first pass.
6511        * Clever compilers notice this and complain. --jhi */
6512     REGC((U8)REG_MAGIC, (char*)RExC_emit);
6513 #endif
6514     DEBUG_PARSE_r(
6515         PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6516         RExC_lastnum=0;
6517         RExC_lastparse=NULL;
6518     );
6519     /* reg may croak on us, not giving us a chance to free
6520        pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
6521        need it to survive as long as the regexp (qr/(?{})/).
6522        We must check that code_blocksv is not already set, because we may
6523        have jumped back to restart the sizing pass. */
6524     if (pRExC_state->code_blocks && !code_blocksv) {
6525         code_blocksv = newSV_type(SVt_PV);
6526         SAVEFREESV(code_blocksv);
6527         SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6528         SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6529     }
6530     if (reg(pRExC_state, 0, &flags,1) == NULL) {
6531         /* It's possible to write a regexp in ascii that represents Unicode
6532         codepoints outside of the byte range, such as via \x{100}. If we
6533         detect such a sequence we have to convert the entire pattern to utf8
6534         and then recompile, as our sizing calculation will have been based
6535         on 1 byte == 1 character, but we will need to use utf8 to encode
6536         at least some part of the pattern, and therefore must convert the whole
6537         thing.
6538         -- dmq */
6539         if (flags & RESTART_UTF8) {
6540             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6541                                     pRExC_state->num_code_blocks);
6542             goto redo_first_pass;
6543         }
6544         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6545     }
6546     if (code_blocksv)
6547         SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6548
6549     DEBUG_PARSE_r({
6550         PerlIO_printf(Perl_debug_log,
6551             "Required size %"IVdf" nodes\n"
6552             "Starting second pass (creation)\n",
6553             (IV)RExC_size);
6554         RExC_lastnum=0;
6555         RExC_lastparse=NULL;
6556     });
6557
6558     /* The first pass could have found things that force Unicode semantics */
6559     if ((RExC_utf8 || RExC_uni_semantics)
6560          && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6561     {
6562         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6563     }
6564
6565     /* Small enough for pointer-storage convention?
6566        If extralen==0, this means that we will not need long jumps. */
6567     if (RExC_size >= 0x10000L && RExC_extralen)
6568         RExC_size += RExC_extralen;
6569     else
6570         RExC_extralen = 0;
6571     if (RExC_whilem_seen > 15)
6572         RExC_whilem_seen = 15;
6573
6574     /* Allocate space and zero-initialize. Note, the two step process
6575        of zeroing when in debug mode, thus anything assigned has to
6576        happen after that */
6577     rx = (REGEXP*) newSV_type(SVt_REGEXP);
6578     r = ReANY(rx);
6579     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6580          char, regexp_internal);
6581     if ( r == NULL || ri == NULL )
6582         FAIL("Regexp out of space");
6583 #ifdef DEBUGGING
6584     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6585     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6586          char);
6587 #else
6588     /* bulk initialize base fields with 0. */
6589     Zero(ri, sizeof(regexp_internal), char);
6590 #endif
6591
6592     /* non-zero initialization begins here */
6593     RXi_SET( r, ri );
6594     r->engine= eng;
6595     r->extflags = rx_flags;
6596     RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6597
6598     if (pm_flags & PMf_IS_QR) {
6599         ri->code_blocks = pRExC_state->code_blocks;
6600         ri->num_code_blocks = pRExC_state->num_code_blocks;
6601     }
6602     else
6603     {
6604         int n;
6605         for (n = 0; n < pRExC_state->num_code_blocks; n++)
6606             if (pRExC_state->code_blocks[n].src_regex)
6607                 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6608         SAVEFREEPV(pRExC_state->code_blocks);
6609     }
6610
6611     {
6612         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6613         bool has_charset = (get_regex_charset(r->extflags)
6614                                                     != REGEX_DEPENDS_CHARSET);
6615
6616         /* The caret is output if there are any defaults: if not all the STD
6617          * flags are set, or if no character set specifier is needed */
6618         bool has_default =
6619                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6620                     || ! has_charset);
6621         bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
6622                                                    == REG_RUN_ON_COMMENT_SEEN);
6623         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6624                             >> RXf_PMf_STD_PMMOD_SHIFT);
6625         const char *fptr = STD_PAT_MODS;        /*"msix"*/
6626         char *p;
6627         /* Allocate for the worst case, which is all the std flags are turned
6628          * on.  If more precision is desired, we could do a population count of
6629          * the flags set.  This could be done with a small lookup table, or by
6630          * shifting, masking and adding, or even, when available, assembly
6631          * language for a machine-language population count.
6632          * We never output a minus, as all those are defaults, so are
6633          * covered by the caret */
6634         const STRLEN wraplen = plen + has_p + has_runon
6635             + has_default       /* If needs a caret */
6636
6637                 /* If needs a character set specifier */
6638             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6639             + (sizeof(STD_PAT_MODS) - 1)
6640             + (sizeof("(?:)") - 1);
6641
6642         Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6643         r->xpv_len_u.xpvlenu_pv = p;
6644         if (RExC_utf8)
6645             SvFLAGS(rx) |= SVf_UTF8;
6646         *p++='('; *p++='?';
6647
6648         /* If a default, cover it using the caret */
6649         if (has_default) {
6650             *p++= DEFAULT_PAT_MOD;
6651         }
6652         if (has_charset) {
6653             STRLEN len;
6654             const char* const name = get_regex_charset_name(r->extflags, &len);
6655             Copy(name, p, len, char);
6656             p += len;
6657         }
6658         if (has_p)
6659             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6660         {
6661             char ch;
6662             while((ch = *fptr++)) {
6663                 if(reganch & 1)
6664                     *p++ = ch;
6665                 reganch >>= 1;
6666             }
6667         }
6668
6669         *p++ = ':';
6670         Copy(RExC_precomp, p, plen, char);
6671         assert ((RX_WRAPPED(rx) - p) < 16);
6672         r->pre_prefix = p - RX_WRAPPED(rx);
6673         p += plen;
6674         if (has_runon)
6675             *p++ = '\n';
6676         *p++ = ')';
6677         *p = 0;
6678         SvCUR_set(rx, p - RX_WRAPPED(rx));
6679     }
6680
6681     r->intflags = 0;
6682     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6683
6684     /* setup various meta data about recursion, this all requires
6685      * RExC_npar to be correctly set, and a bit later on we clear it */
6686     if (RExC_seen & REG_RECURSE_SEEN) {
6687         Newxz(RExC_open_parens, RExC_npar,regnode *);
6688         SAVEFREEPV(RExC_open_parens);
6689         Newxz(RExC_close_parens,RExC_npar,regnode *);
6690         SAVEFREEPV(RExC_close_parens);
6691     }
6692     if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) {
6693         /* Note, RExC_npar is 1 + the number of parens in a pattern.
6694          * So its 1 if there are no parens. */
6695         RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6696                                          ((RExC_npar & 0x07) != 0);
6697         Newx(RExC_study_chunk_recursed,
6698              RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6699         SAVEFREEPV(RExC_study_chunk_recursed);
6700     }
6701
6702     /* Useful during FAIL. */
6703 #ifdef RE_TRACK_PATTERN_OFFSETS
6704     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6705     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6706                           "%s %"UVuf" bytes for offset annotations.\n",
6707                           ri->u.offsets ? "Got" : "Couldn't get",
6708                           (UV)((2*RExC_size+1) * sizeof(U32))));
6709 #endif
6710     SetProgLen(ri,RExC_size);
6711     RExC_rx_sv = rx;
6712     RExC_rx = r;
6713     RExC_rxi = ri;
6714
6715     /* Second pass: emit code. */
6716     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
6717     RExC_pm_flags = pm_flags;
6718     RExC_parse = exp;
6719     RExC_end = exp + plen;
6720     RExC_naughty = 0;
6721     RExC_npar = 1;
6722     RExC_emit_start = ri->program;
6723     RExC_emit = ri->program;
6724     RExC_emit_bound = ri->program + RExC_size + 1;
6725     pRExC_state->code_index = 0;
6726
6727     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6728     if (reg(pRExC_state, 0, &flags,1) == NULL) {
6729         ReREFCNT_dec(rx);
6730         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6731     }
6732     /* XXXX To minimize changes to RE engine we always allocate
6733        3-units-long substrs field. */
6734     Newx(r->substrs, 1, struct reg_substr_data);
6735     if (RExC_recurse_count) {
6736         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6737         SAVEFREEPV(RExC_recurse);
6738     }
6739
6740 reStudy:
6741     r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6742     Zero(r->substrs, 1, struct reg_substr_data);
6743     if (RExC_study_chunk_recursed)
6744         Zero(RExC_study_chunk_recursed,
6745              RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6746
6747 #ifdef TRIE_STUDY_OPT
6748     if (!restudied) {
6749         StructCopy(&zero_scan_data, &data, scan_data_t);
6750         copyRExC_state = RExC_state;
6751     } else {
6752         U32 seen=RExC_seen;
6753         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6754
6755         RExC_state = copyRExC_state;
6756         if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
6757             RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
6758         else
6759             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
6760         StructCopy(&zero_scan_data, &data, scan_data_t);
6761     }
6762 #else
6763     StructCopy(&zero_scan_data, &data, scan_data_t);
6764 #endif
6765
6766     /* Dig out information for optimizations. */
6767     r->extflags = RExC_flags; /* was pm_op */
6768     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6769
6770     if (UTF)
6771         SvUTF8_on(rx);  /* Unicode in it? */
6772     ri->regstclass = NULL;
6773     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
6774         r->intflags |= PREGf_NAUGHTY;
6775     scan = ri->program + 1;             /* First BRANCH. */
6776
6777     /* testing for BRANCH here tells us whether there is "must appear"
6778        data in the pattern. If there is then we can use it for optimisations */
6779     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /*  Only one top-level choice.
6780                                                   */
6781         SSize_t fake;
6782         STRLEN longest_float_length, longest_fixed_length;
6783         regnode_ssc ch_class; /* pointed to by data */
6784         int stclass_flag;
6785         SSize_t last_close = 0; /* pointed to by data */
6786         regnode *first= scan;
6787         regnode *first_next= regnext(first);
6788         /*
6789          * Skip introductions and multiplicators >= 1
6790          * so that we can extract the 'meat' of the pattern that must
6791          * match in the large if() sequence following.
6792          * NOTE that EXACT is NOT covered here, as it is normally
6793          * picked up by the optimiser separately.
6794          *
6795          * This is unfortunate as the optimiser isnt handling lookahead
6796          * properly currently.
6797          *
6798          */
6799         while ((OP(first) == OPEN && (sawopen = 1)) ||
6800                /* An OR of *one* alternative - should not happen now. */
6801             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6802             /* for now we can't handle lookbehind IFMATCH*/
6803             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6804             (OP(first) == PLUS) ||
6805             (OP(first) == MINMOD) ||
6806                /* An {n,m} with n>0 */
6807             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6808             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6809         {
6810                 /*
6811                  * the only op that could be a regnode is PLUS, all the rest
6812                  * will be regnode_1 or regnode_2.
6813                  *
6814                  * (yves doesn't think this is true)
6815                  */
6816                 if (OP(first) == PLUS)
6817                     sawplus = 1;
6818                 else {
6819                     if (OP(first) == MINMOD)
6820                         sawminmod = 1;
6821                     first += regarglen[OP(first)];
6822                 }
6823                 first = NEXTOPER(first);
6824                 first_next= regnext(first);
6825         }
6826
6827         /* Starting-point info. */
6828       again:
6829         DEBUG_PEEP("first:",first,0);
6830         /* Ignore EXACT as we deal with it later. */
6831         if (PL_regkind[OP(first)] == EXACT) {
6832             if (OP(first) == EXACT)
6833                 NOOP;   /* Empty, get anchored substr later. */
6834             else
6835                 ri->regstclass = first;
6836         }
6837 #ifdef TRIE_STCLASS
6838         else if (PL_regkind[OP(first)] == TRIE &&
6839                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6840         {
6841             /* this can happen only on restudy */
6842             ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
6843         }
6844 #endif
6845         else if (REGNODE_SIMPLE(OP(first)))
6846             ri->regstclass = first;
6847         else if (PL_regkind[OP(first)] == BOUND ||
6848                  PL_regkind[OP(first)] == NBOUND)
6849             ri->regstclass = first;
6850         else if (PL_regkind[OP(first)] == BOL) {
6851             r->intflags |= (OP(first) == MBOL
6852                            ? PREGf_ANCH_MBOL
6853                            : (OP(first) == SBOL
6854                               ? PREGf_ANCH_SBOL
6855                               : PREGf_ANCH_BOL));
6856             first = NEXTOPER(first);
6857             goto again;
6858         }
6859         else if (OP(first) == GPOS) {
6860             r->intflags |= PREGf_ANCH_GPOS;
6861             first = NEXTOPER(first);
6862             goto again;
6863         }
6864         else if ((!sawopen || !RExC_sawback) &&
6865             (OP(first) == STAR &&
6866             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6867             !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
6868         {
6869             /* turn .* into ^.* with an implied $*=1 */
6870             const int type =
6871                 (OP(NEXTOPER(first)) == REG_ANY)
6872                     ? PREGf_ANCH_MBOL
6873                     : PREGf_ANCH_SBOL;
6874             r->intflags |= (type | PREGf_IMPLICIT);
6875             first = NEXTOPER(first);
6876             goto again;
6877         }
6878         if (sawplus && !sawminmod && !sawlookahead
6879             && (!sawopen || !RExC_sawback)
6880             && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6881             /* x+ must match at the 1st pos of run of x's */
6882             r->intflags |= PREGf_SKIP;
6883
6884         /* Scan is after the zeroth branch, first is atomic matcher. */
6885 #ifdef TRIE_STUDY_OPT
6886         DEBUG_PARSE_r(
6887             if (!restudied)
6888                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6889                               (IV)(first - scan + 1))
6890         );
6891 #else
6892         DEBUG_PARSE_r(
6893             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6894                 (IV)(first - scan + 1))
6895         );
6896 #endif
6897
6898
6899         /*
6900         * If there's something expensive in the r.e., find the
6901         * longest literal string that must appear and make it the
6902         * regmust.  Resolve ties in favor of later strings, since
6903         * the regstart check works with the beginning of the r.e.
6904         * and avoiding duplication strengthens checking.  Not a
6905         * strong reason, but sufficient in the absence of others.
6906         * [Now we resolve ties in favor of the earlier string if
6907         * it happens that c_offset_min has been invalidated, since the
6908         * earlier string may buy us something the later one won't.]
6909         */
6910
6911         data.longest_fixed = newSVpvs("");
6912         data.longest_float = newSVpvs("");
6913         data.last_found = newSVpvs("");
6914         data.longest = &(data.longest_fixed);
6915         ENTER_with_name("study_chunk");
6916         SAVEFREESV(data.longest_fixed);
6917         SAVEFREESV(data.longest_float);
6918         SAVEFREESV(data.last_found);
6919         first = scan;
6920         if (!ri->regstclass) {
6921             ssc_init(pRExC_state, &ch_class);
6922             data.start_class = &ch_class;
6923             stclass_flag = SCF_DO_STCLASS_AND;
6924         } else                          /* XXXX Check for BOUND? */
6925             stclass_flag = 0;
6926         data.last_closep = &last_close;
6927
6928         DEBUG_RExC_seen();
6929         minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
6930                              scan + RExC_size, /* Up to end */
6931             &data, -1, 0, NULL,
6932             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
6933                           | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6934             0);
6935
6936
6937         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6938
6939
6940         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6941              && data.last_start_min == 0 && data.last_end > 0
6942              && !RExC_seen_zerolen
6943              && !(RExC_seen & REG_VERBARG_SEEN)
6944              && !(RExC_seen & REG_GPOS_SEEN)
6945         ){
6946             r->extflags |= RXf_CHECK_ALL;
6947         }
6948         scan_commit(pRExC_state, &data,&minlen,0);
6949
6950         longest_float_length = CHR_SVLEN(data.longest_float);
6951
6952         if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
6953                    && data.offset_fixed == data.offset_float_min
6954                    && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6955             && S_setup_longest (aTHX_ pRExC_state,
6956                                     data.longest_float,
6957                                     &(r->float_utf8),
6958                                     &(r->float_substr),
6959                                     &(r->float_end_shift),
6960                                     data.lookbehind_float,
6961                                     data.offset_float_min,
6962                                     data.minlen_float,
6963                                     longest_float_length,
6964                                     cBOOL(data.flags & SF_FL_BEFORE_EOL),
6965                                     cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6966         {
6967             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6968             r->float_max_offset = data.offset_float_max;
6969             if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
6970                 r->float_max_offset -= data.lookbehind_float;
6971             SvREFCNT_inc_simple_void_NN(data.longest_float);
6972         }
6973         else {
6974             r->float_substr = r->float_utf8 = NULL;
6975             longest_float_length = 0;
6976         }
6977
6978         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6979
6980         if (S_setup_longest (aTHX_ pRExC_state,
6981                                 data.longest_fixed,
6982                                 &(r->anchored_utf8),
6983                                 &(r->anchored_substr),
6984                                 &(r->anchored_end_shift),
6985                                 data.lookbehind_fixed,
6986                                 data.offset_fixed,
6987                                 data.minlen_fixed,
6988                                 longest_fixed_length,
6989                                 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6990                                 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6991         {
6992             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6993             SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6994         }
6995         else {
6996             r->anchored_substr = r->anchored_utf8 = NULL;
6997             longest_fixed_length = 0;
6998         }
6999         LEAVE_with_name("study_chunk");
7000
7001         if (ri->regstclass
7002             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
7003             ri->regstclass = NULL;
7004
7005         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
7006             && stclass_flag
7007             && ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
7008             && !ssc_is_anything(data.start_class))
7009         {
7010             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7011
7012             ssc_finalize(pRExC_state, data.start_class);
7013
7014             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7015             StructCopy(data.start_class,
7016                        (regnode_ssc*)RExC_rxi->data->data[n],
7017                        regnode_ssc);
7018             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7019             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7020             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
7021                       regprop(r, sv, (regnode*)data.start_class, NULL);
7022                       PerlIO_printf(Perl_debug_log,
7023                                     "synthetic stclass \"%s\".\n",
7024                                     SvPVX_const(sv));});
7025             data.start_class = NULL;
7026         }
7027
7028         /* A temporary algorithm prefers floated substr to fixed one to dig
7029          * more info. */
7030         if (longest_fixed_length > longest_float_length) {
7031             r->substrs->check_ix = 0;
7032             r->check_end_shift = r->anchored_end_shift;
7033             r->check_substr = r->anchored_substr;
7034             r->check_utf8 = r->anchored_utf8;
7035             r->check_offset_min = r->check_offset_max = r->anchored_offset;
7036             if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
7037                 r->intflags |= PREGf_NOSCAN;
7038         }
7039         else {
7040             r->substrs->check_ix = 1;
7041             r->check_end_shift = r->float_end_shift;
7042             r->check_substr = r->float_substr;
7043             r->check_utf8 = r->float_utf8;
7044             r->check_offset_min = r->float_min_offset;
7045             r->check_offset_max = r->float_max_offset;
7046         }
7047         if ((r->check_substr || r->check_utf8) ) {
7048             r->extflags |= RXf_USE_INTUIT;
7049             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
7050                 r->extflags |= RXf_INTUIT_TAIL;
7051         }
7052         r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
7053
7054         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
7055         if ( (STRLEN)minlen < longest_float_length )
7056             minlen= longest_float_length;
7057         if ( (STRLEN)minlen < longest_fixed_length )
7058             minlen= longest_fixed_length;
7059         */
7060     }
7061     else {
7062         /* Several toplevels. Best we can is to set minlen. */
7063         SSize_t fake;
7064         regnode_ssc ch_class;
7065         SSize_t last_close = 0;
7066
7067         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
7068
7069         scan = ri->program + 1;
7070         ssc_init(pRExC_state, &ch_class);
7071         data.start_class = &ch_class;
7072         data.last_closep = &last_close;
7073
7074         DEBUG_RExC_seen();
7075         minlen = study_chunk(pRExC_state,
7076             &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7077             SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7078                                                       ? SCF_TRIE_DOING_RESTUDY
7079                                                       : 0),
7080             0);
7081
7082         CHECK_RESTUDY_GOTO_butfirst(NOOP);
7083
7084         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7085                 = r->float_substr = r->float_utf8 = NULL;
7086
7087         if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
7088             && ! ssc_is_anything(data.start_class))
7089         {
7090             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7091
7092             ssc_finalize(pRExC_state, data.start_class);
7093
7094             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7095             StructCopy(data.start_class,
7096                        (regnode_ssc*)RExC_rxi->data->data[n],
7097                        regnode_ssc);
7098             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7099             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7100             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7101                       regprop(r, sv, (regnode*)data.start_class, NULL);
7102                       PerlIO_printf(Perl_debug_log,
7103                                     "synthetic stclass \"%s\".\n",
7104                                     SvPVX_const(sv));});
7105             data.start_class = NULL;
7106         }
7107     }
7108
7109     if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7110         r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7111         r->maxlen = REG_INFTY;
7112     }
7113     else {
7114         r->maxlen = RExC_maxlen;
7115     }
7116
7117     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7118        the "real" pattern. */
7119     DEBUG_OPTIMISE_r({
7120         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%ld\n",
7121                       (IV)minlen, (IV)r->minlen, RExC_maxlen);
7122     });
7123     r->minlenret = minlen;
7124     if (r->minlen < minlen)
7125         r->minlen = minlen;
7126
7127     if (RExC_seen & REG_GPOS_SEEN)
7128         r->intflags |= PREGf_GPOS_SEEN;
7129     if (RExC_seen & REG_LOOKBEHIND_SEEN)
7130         r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7131                                                 lookbehind */
7132     if (pRExC_state->num_code_blocks)
7133         r->extflags |= RXf_EVAL_SEEN;
7134     if (RExC_seen & REG_CANY_SEEN)
7135         r->intflags |= PREGf_CANY_SEEN;
7136     if (RExC_seen & REG_VERBARG_SEEN)
7137     {
7138         r->intflags |= PREGf_VERBARG_SEEN;
7139         r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7140     }
7141     if (RExC_seen & REG_CUTGROUP_SEEN)
7142         r->intflags |= PREGf_CUTGROUP_SEEN;
7143     if (pm_flags & PMf_USE_RE_EVAL)
7144         r->intflags |= PREGf_USE_RE_EVAL;
7145     if (RExC_paren_names)
7146         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7147     else
7148         RXp_PAREN_NAMES(r) = NULL;
7149
7150     /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7151      * so it can be used in pp.c */
7152     if (r->intflags & PREGf_ANCH)
7153         r->extflags |= RXf_IS_ANCHORED;
7154
7155
7156     {
7157         /* this is used to identify "special" patterns that might result
7158          * in Perl NOT calling the regex engine and instead doing the match "itself",
7159          * particularly special cases in split//. By having the regex compiler
7160          * do this pattern matching at a regop level (instead of by inspecting the pattern)
7161          * we avoid weird issues with equivalent patterns resulting in different behavior,
7162          * AND we allow non Perl engines to get the same optimizations by the setting the
7163          * flags appropriately - Yves */
7164         regnode *first = ri->program + 1;
7165         U8 fop = OP(first);
7166         regnode *next = NEXTOPER(first);
7167         U8 nop = OP(next);
7168
7169         if (PL_regkind[fop] == NOTHING && nop == END)
7170             r->extflags |= RXf_NULL;
7171         else if (PL_regkind[fop] == BOL && nop == END)
7172             r->extflags |= RXf_START_ONLY;
7173         else if (fop == PLUS
7174                  && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7175                  && OP(regnext(first)) == END)
7176             r->extflags |= RXf_WHITE;
7177         else if ( r->extflags & RXf_SPLIT
7178                   && fop == EXACT
7179                   && STR_LEN(first) == 1
7180                   && *(STRING(first)) == ' '
7181                   && OP(regnext(first)) == END )
7182             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7183
7184     }
7185
7186     if (RExC_contains_locale) {
7187         RXp_EXTFLAGS(r) |= RXf_TAINTED;
7188     }
7189
7190 #ifdef DEBUGGING
7191     if (RExC_paren_names) {
7192         ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7193         ri->data->data[ri->name_list_idx]
7194                                    = (void*)SvREFCNT_inc(RExC_paren_name_list);
7195     } else
7196 #endif
7197         ri->name_list_idx = 0;
7198
7199     if (RExC_recurse_count) {
7200         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
7201             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
7202             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
7203         }
7204     }
7205     Newxz(r->offs, RExC_npar, regexp_paren_pair);
7206     /* assume we don't need to swap parens around before we match */
7207
7208     DEBUG_DUMP_r({
7209         DEBUG_RExC_seen();
7210         PerlIO_printf(Perl_debug_log,"Final program:\n");
7211         regdump(r);
7212     });
7213 #ifdef RE_TRACK_PATTERN_OFFSETS
7214     DEBUG_OFFSETS_r(if (ri->u.offsets) {
7215         const STRLEN len = ri->u.offsets[0];
7216         STRLEN i;
7217         GET_RE_DEBUG_FLAGS_DECL;
7218         PerlIO_printf(Perl_debug_log,
7219                       "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7220         for (i = 1; i <= len; i++) {
7221             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7222                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7223                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7224             }
7225         PerlIO_printf(Perl_debug_log, "\n");
7226     });
7227 #endif
7228
7229 #ifdef USE_ITHREADS
7230     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7231      * by setting the regexp SV to readonly-only instead. If the
7232      * pattern's been recompiled, the USEDness should remain. */
7233     if (old_re && SvREADONLY(old_re))
7234         SvREADONLY_on(rx);
7235 #endif
7236     return rx;
7237 }
7238
7239
7240 SV*
7241 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7242                     const U32 flags)
7243 {
7244     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7245
7246     PERL_UNUSED_ARG(value);
7247
7248     if (flags & RXapif_FETCH) {
7249         return reg_named_buff_fetch(rx, key, flags);
7250     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7251         Perl_croak_no_modify();
7252         return NULL;
7253     } else if (flags & RXapif_EXISTS) {
7254         return reg_named_buff_exists(rx, key, flags)
7255             ? &PL_sv_yes
7256             : &PL_sv_no;
7257     } else if (flags & RXapif_REGNAMES) {
7258         return reg_named_buff_all(rx, flags);
7259     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7260         return reg_named_buff_scalar(rx, flags);
7261     } else {
7262         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7263         return NULL;
7264     }
7265 }
7266
7267 SV*
7268 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7269                          const U32 flags)
7270 {
7271     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7272     PERL_UNUSED_ARG(lastkey);
7273
7274     if (flags & RXapif_FIRSTKEY)
7275         return reg_named_buff_firstkey(rx, flags);
7276     else if (flags & RXapif_NEXTKEY)
7277         return reg_named_buff_nextkey(rx, flags);
7278     else {
7279         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7280                                             (int)flags);
7281         return NULL;
7282     }
7283 }
7284
7285 SV*
7286 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7287                           const U32 flags)
7288 {
7289     AV *retarray = NULL;
7290     SV *ret;
7291     struct regexp *const rx = ReANY(r);
7292
7293     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7294
7295     if (flags & RXapif_ALL)
7296         retarray=newAV();
7297
7298     if (rx && RXp_PAREN_NAMES(rx)) {
7299         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7300         if (he_str) {
7301             IV i;
7302             SV* sv_dat=HeVAL(he_str);
7303             I32 *nums=(I32*)SvPVX(sv_dat);
7304             for ( i=0; i<SvIVX(sv_dat); i++ ) {
7305                 if ((I32)(rx->nparens) >= nums[i]
7306                     && rx->offs[nums[i]].start != -1
7307                     && rx->offs[nums[i]].end != -1)
7308                 {
7309                     ret = newSVpvs("");
7310                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7311                     if (!retarray)
7312                         return ret;
7313                 } else {
7314                     if (retarray)
7315                         ret = newSVsv(&PL_sv_undef);
7316                 }
7317                 if (retarray)
7318                     av_push(retarray, ret);
7319             }
7320             if (retarray)
7321                 return newRV_noinc(MUTABLE_SV(retarray));
7322         }
7323     }
7324     return NULL;
7325 }
7326
7327 bool
7328 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7329                            const U32 flags)
7330 {
7331     struct regexp *const rx = ReANY(r);
7332
7333     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7334
7335     if (rx && RXp_PAREN_NAMES(rx)) {
7336         if (flags & RXapif_ALL) {
7337             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7338         } else {
7339             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7340             if (sv) {
7341                 SvREFCNT_dec_NN(sv);
7342                 return TRUE;
7343             } else {
7344                 return FALSE;
7345             }
7346         }
7347     } else {
7348         return FALSE;
7349     }
7350 }
7351
7352 SV*
7353 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7354 {
7355     struct regexp *const rx = ReANY(r);
7356
7357     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7358
7359     if ( rx && RXp_PAREN_NAMES(rx) ) {
7360         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7361
7362         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7363     } else {
7364         return FALSE;
7365     }
7366 }
7367
7368 SV*
7369 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7370 {
7371     struct regexp *const rx = ReANY(r);
7372     GET_RE_DEBUG_FLAGS_DECL;
7373
7374     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7375
7376     if (rx && RXp_PAREN_NAMES(rx)) {
7377         HV *hv = RXp_PAREN_NAMES(rx);
7378         HE *temphe;
7379         while ( (temphe = hv_iternext_flags(hv,0)) ) {
7380             IV i;
7381             IV parno = 0;
7382             SV* sv_dat = HeVAL(temphe);
7383             I32 *nums = (I32*)SvPVX(sv_dat);
7384             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7385                 if ((I32)(rx->lastparen) >= nums[i] &&
7386                     rx->offs[nums[i]].start != -1 &&
7387                     rx->offs[nums[i]].end != -1)
7388                 {
7389                     parno = nums[i];
7390                     break;
7391                 }
7392             }
7393             if (parno || flags & RXapif_ALL) {
7394                 return newSVhek(HeKEY_hek(temphe));
7395             }
7396         }
7397     }
7398     return NULL;
7399 }
7400
7401 SV*
7402 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7403 {
7404     SV *ret;
7405     AV *av;
7406     SSize_t length;
7407     struct regexp *const rx = ReANY(r);
7408
7409     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7410
7411     if (rx && RXp_PAREN_NAMES(rx)) {
7412         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7413             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7414         } else if (flags & RXapif_ONE) {
7415             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7416             av = MUTABLE_AV(SvRV(ret));
7417             length = av_tindex(av);
7418             SvREFCNT_dec_NN(ret);
7419             return newSViv(length + 1);
7420         } else {
7421             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7422                                                 (int)flags);
7423             return NULL;
7424         }
7425     }
7426     return &PL_sv_undef;
7427 }
7428
7429 SV*
7430 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7431 {
7432     struct regexp *const rx = ReANY(r);
7433     AV *av = newAV();
7434
7435     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7436
7437     if (rx && RXp_PAREN_NAMES(rx)) {
7438         HV *hv= RXp_PAREN_NAMES(rx);
7439         HE *temphe;
7440         (void)hv_iterinit(hv);
7441         while ( (temphe = hv_iternext_flags(hv,0)) ) {
7442             IV i;
7443             IV parno = 0;
7444             SV* sv_dat = HeVAL(temphe);
7445             I32 *nums = (I32*)SvPVX(sv_dat);
7446             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7447                 if ((I32)(rx->lastparen) >= nums[i] &&
7448                     rx->offs[nums[i]].start != -1 &&
7449                     rx->offs[nums[i]].end != -1)
7450                 {
7451                     parno = nums[i];
7452                     break;
7453                 }
7454             }
7455             if (parno || flags & RXapif_ALL) {
7456                 av_push(av, newSVhek(HeKEY_hek(temphe)));
7457             }
7458         }
7459     }
7460
7461     return newRV_noinc(MUTABLE_SV(av));
7462 }
7463
7464 void
7465 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7466                              SV * const sv)
7467 {
7468     struct regexp *const rx = ReANY(r);
7469     char *s = NULL;
7470     SSize_t i = 0;
7471     SSize_t s1, t1;
7472     I32 n = paren;
7473
7474     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7475
7476     if (      n == RX_BUFF_IDX_CARET_PREMATCH
7477            || n == RX_BUFF_IDX_CARET_FULLMATCH
7478            || n == RX_BUFF_IDX_CARET_POSTMATCH
7479        )
7480     {
7481         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7482         if (!keepcopy) {
7483             /* on something like
7484              *    $r = qr/.../;
7485              *    /$qr/p;
7486              * the KEEPCOPY is set on the PMOP rather than the regex */
7487             if (PL_curpm && r == PM_GETRE(PL_curpm))
7488                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7489         }
7490         if (!keepcopy)
7491             goto ret_undef;
7492     }
7493
7494     if (!rx->subbeg)
7495         goto ret_undef;
7496
7497     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7498         /* no need to distinguish between them any more */
7499         n = RX_BUFF_IDX_FULLMATCH;
7500
7501     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7502         && rx->offs[0].start != -1)
7503     {
7504         /* $`, ${^PREMATCH} */
7505         i = rx->offs[0].start;
7506         s = rx->subbeg;
7507     }
7508     else
7509     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7510         && rx->offs[0].end != -1)
7511     {
7512         /* $', ${^POSTMATCH} */
7513         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7514         i = rx->sublen + rx->suboffset - rx->offs[0].end;
7515     }
7516     else
7517     if ( 0 <= n && n <= (I32)rx->nparens &&
7518         (s1 = rx->offs[n].start) != -1 &&
7519         (t1 = rx->offs[n].end) != -1)
7520     {
7521         /* $&, ${^MATCH},  $1 ... */
7522         i = t1 - s1;
7523         s = rx->subbeg + s1 - rx->suboffset;
7524     } else {
7525         goto ret_undef;
7526     }
7527
7528     assert(s >= rx->subbeg);
7529     assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7530     if (i >= 0) {
7531 #ifdef NO_TAINT_SUPPORT
7532         sv_setpvn(sv, s, i);
7533 #else
7534         const int oldtainted = TAINT_get;
7535         TAINT_NOT;
7536         sv_setpvn(sv, s, i);
7537         TAINT_set(oldtainted);
7538 #endif
7539         if ( (rx->intflags & PREGf_CANY_SEEN)
7540             ? (RXp_MATCH_UTF8(rx)
7541                         && (!i || is_utf8_string((U8*)s, i)))
7542             : (RXp_MATCH_UTF8(rx)) )
7543         {
7544             SvUTF8_on(sv);
7545         }
7546         else
7547             SvUTF8_off(sv);
7548         if (TAINTING_get) {
7549             if (RXp_MATCH_TAINTED(rx)) {
7550                 if (SvTYPE(sv) >= SVt_PVMG) {
7551                     MAGIC* const mg = SvMAGIC(sv);
7552                     MAGIC* mgt;
7553                     TAINT;
7554                     SvMAGIC_set(sv, mg->mg_moremagic);
7555                     SvTAINT(sv);
7556                     if ((mgt = SvMAGIC(sv))) {
7557                         mg->mg_moremagic = mgt;
7558                         SvMAGIC_set(sv, mg);
7559                     }
7560                 } else {
7561                     TAINT;
7562                     SvTAINT(sv);
7563                 }
7564             } else
7565                 SvTAINTED_off(sv);
7566         }
7567     } else {
7568       ret_undef:
7569         sv_setsv(sv,&PL_sv_undef);
7570         return;
7571     }
7572 }
7573
7574 void
7575 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7576                                                          SV const * const value)
7577 {
7578     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7579
7580     PERL_UNUSED_ARG(rx);
7581     PERL_UNUSED_ARG(paren);
7582     PERL_UNUSED_ARG(value);
7583
7584     if (!PL_localizing)
7585         Perl_croak_no_modify();
7586 }
7587
7588 I32
7589 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7590                               const I32 paren)
7591 {
7592     struct regexp *const rx = ReANY(r);
7593     I32 i;
7594     I32 s1, t1;
7595
7596     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7597
7598     if (   paren == RX_BUFF_IDX_CARET_PREMATCH
7599         || paren == RX_BUFF_IDX_CARET_FULLMATCH
7600         || paren == RX_BUFF_IDX_CARET_POSTMATCH
7601     )
7602     {
7603         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7604         if (!keepcopy) {
7605             /* on something like
7606              *    $r = qr/.../;
7607              *    /$qr/p;
7608              * the KEEPCOPY is set on the PMOP rather than the regex */
7609             if (PL_curpm && r == PM_GETRE(PL_curpm))
7610                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7611         }
7612         if (!keepcopy)
7613             goto warn_undef;
7614     }
7615
7616     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7617     switch (paren) {
7618       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7619       case RX_BUFF_IDX_PREMATCH:       /* $` */
7620         if (rx->offs[0].start != -1) {
7621                         i = rx->offs[0].start;
7622                         if (i > 0) {
7623                                 s1 = 0;
7624                                 t1 = i;
7625                                 goto getlen;
7626                         }
7627             }
7628         return 0;
7629
7630       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7631       case RX_BUFF_IDX_POSTMATCH:       /* $' */
7632             if (rx->offs[0].end != -1) {
7633                         i = rx->sublen - rx->offs[0].end;
7634                         if (i > 0) {
7635                                 s1 = rx->offs[0].end;
7636                                 t1 = rx->sublen;
7637                                 goto getlen;
7638                         }
7639             }
7640         return 0;
7641
7642       default: /* $& / ${^MATCH}, $1, $2, ... */
7643             if (paren <= (I32)rx->nparens &&
7644             (s1 = rx->offs[paren].start) != -1 &&
7645             (t1 = rx->offs[paren].end) != -1)
7646             {
7647             i = t1 - s1;
7648             goto getlen;
7649         } else {
7650           warn_undef:
7651             if (ckWARN(WARN_UNINITIALIZED))
7652                 report_uninit((const SV *)sv);
7653             return 0;
7654         }
7655     }
7656   getlen:
7657     if (i > 0 && RXp_MATCH_UTF8(rx)) {
7658         const char * const s = rx->subbeg - rx->suboffset + s1;
7659         const U8 *ep;
7660         STRLEN el;
7661
7662         i = t1 - s1;
7663         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7664                         i = el;
7665     }
7666     return i;
7667 }
7668
7669 SV*
7670 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7671 {
7672     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7673         PERL_UNUSED_ARG(rx);
7674         if (0)
7675             return NULL;
7676         else
7677             return newSVpvs("Regexp");
7678 }
7679
7680 /* Scans the name of a named buffer from the pattern.
7681  * If flags is REG_RSN_RETURN_NULL returns null.
7682  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7683  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7684  * to the parsed name as looked up in the RExC_paren_names hash.
7685  * If there is an error throws a vFAIL().. type exception.
7686  */
7687
7688 #define REG_RSN_RETURN_NULL    0
7689 #define REG_RSN_RETURN_NAME    1
7690 #define REG_RSN_RETURN_DATA    2
7691
7692 STATIC SV*
7693 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7694 {
7695     char *name_start = RExC_parse;
7696
7697     PERL_ARGS_ASSERT_REG_SCAN_NAME;
7698
7699     assert (RExC_parse <= RExC_end);
7700     if (RExC_parse == RExC_end) NOOP;
7701     else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7702          /* skip IDFIRST by using do...while */
7703         if (UTF)
7704             do {
7705                 RExC_parse += UTF8SKIP(RExC_parse);
7706             } while (isWORDCHAR_utf8((U8*)RExC_parse));
7707         else
7708             do {
7709                 RExC_parse++;
7710             } while (isWORDCHAR(*RExC_parse));
7711     } else {
7712         RExC_parse++; /* so the <- from the vFAIL is after the offending
7713                          character */
7714         vFAIL("Group name must start with a non-digit word character");
7715     }
7716     if ( flags ) {
7717         SV* sv_name
7718             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7719                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7720         if ( flags == REG_RSN_RETURN_NAME)
7721             return sv_name;
7722         else if (flags==REG_RSN_RETURN_DATA) {
7723             HE *he_str = NULL;
7724             SV *sv_dat = NULL;
7725             if ( ! sv_name )      /* should not happen*/
7726                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7727             if (RExC_paren_names)
7728                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7729             if ( he_str )
7730                 sv_dat = HeVAL(he_str);
7731             if ( ! sv_dat )
7732                 vFAIL("Reference to nonexistent named group");
7733             return sv_dat;
7734         }
7735         else {
7736             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7737                        (unsigned long) flags);
7738         }
7739         assert(0); /* NOT REACHED */
7740     }
7741     return NULL;
7742 }
7743
7744 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
7745     int rem=(int)(RExC_end - RExC_parse);                       \
7746     int cut;                                                    \
7747     int num;                                                    \
7748     int iscut=0;                                                \
7749     if (rem>10) {                                               \
7750         rem=10;                                                 \
7751         iscut=1;                                                \
7752     }                                                           \
7753     cut=10-rem;                                                 \
7754     if (RExC_lastparse!=RExC_parse)                             \
7755         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
7756             rem, RExC_parse,                                    \
7757             cut + 4,                                            \
7758             iscut ? "..." : "<"                                 \
7759         );                                                      \
7760     else                                                        \
7761         PerlIO_printf(Perl_debug_log,"%16s","");                \
7762                                                                 \
7763     if (SIZE_ONLY)                                              \
7764        num = RExC_size + 1;                                     \
7765     else                                                        \
7766        num=REG_NODE_NUM(RExC_emit);                             \
7767     if (RExC_lastnum!=num)                                      \
7768        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
7769     else                                                        \
7770        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
7771     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
7772         (int)((depth*2)), "",                                   \
7773         (funcname)                                              \
7774     );                                                          \
7775     RExC_lastnum=num;                                           \
7776     RExC_lastparse=RExC_parse;                                  \
7777 })
7778
7779
7780
7781 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
7782     DEBUG_PARSE_MSG((funcname));                            \
7783     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
7784 })
7785 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
7786     DEBUG_PARSE_MSG((funcname));                            \
7787     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
7788 })
7789
7790 /* This section of code defines the inversion list object and its methods.  The
7791  * interfaces are highly subject to change, so as much as possible is static to
7792  * this file.  An inversion list is here implemented as a malloc'd C UV array
7793  * as an SVt_INVLIST scalar.
7794  *
7795  * An inversion list for Unicode is an array of code points, sorted by ordinal
7796  * number.  The zeroth element is the first code point in the list.  The 1th
7797  * element is the first element beyond that not in the list.  In other words,
7798  * the first range is
7799  *  invlist[0]..(invlist[1]-1)
7800  * The other ranges follow.  Thus every element whose index is divisible by two
7801  * marks the beginning of a range that is in the list, and every element not
7802  * divisible by two marks the beginning of a range not in the list.  A single
7803  * element inversion list that contains the single code point N generally
7804  * consists of two elements
7805  *  invlist[0] == N
7806  *  invlist[1] == N+1
7807  * (The exception is when N is the highest representable value on the
7808  * machine, in which case the list containing just it would be a single
7809  * element, itself.  By extension, if the last range in the list extends to
7810  * infinity, then the first element of that range will be in the inversion list
7811  * at a position that is divisible by two, and is the final element in the
7812  * list.)
7813  * Taking the complement (inverting) an inversion list is quite simple, if the
7814  * first element is 0, remove it; otherwise add a 0 element at the beginning.
7815  * This implementation reserves an element at the beginning of each inversion
7816  * list to always contain 0; there is an additional flag in the header which
7817  * indicates if the list begins at the 0, or is offset to begin at the next
7818  * element.
7819  *
7820  * More about inversion lists can be found in "Unicode Demystified"
7821  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7822  * More will be coming when functionality is added later.
7823  *
7824  * The inversion list data structure is currently implemented as an SV pointing
7825  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
7826  * array of UV whose memory management is automatically handled by the existing
7827  * facilities for SV's.
7828  *
7829  * Some of the methods should always be private to the implementation, and some
7830  * should eventually be made public */
7831
7832 /* The header definitions are in F<inline_invlist.c> */
7833
7834 PERL_STATIC_INLINE UV*
7835 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7836 {
7837     /* Returns a pointer to the first element in the inversion list's array.
7838      * This is called upon initialization of an inversion list.  Where the
7839      * array begins depends on whether the list has the code point U+0000 in it
7840      * or not.  The other parameter tells it whether the code that follows this
7841      * call is about to put a 0 in the inversion list or not.  The first
7842      * element is either the element reserved for 0, if TRUE, or the element
7843      * after it, if FALSE */
7844
7845     bool* offset = get_invlist_offset_addr(invlist);
7846     UV* zero_addr = (UV *) SvPVX(invlist);
7847
7848     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7849
7850     /* Must be empty */
7851     assert(! _invlist_len(invlist));
7852
7853     *zero_addr = 0;
7854
7855     /* 1^1 = 0; 1^0 = 1 */
7856     *offset = 1 ^ will_have_0;
7857     return zero_addr + *offset;
7858 }
7859
7860 PERL_STATIC_INLINE UV*
7861 S_invlist_array(pTHX_ SV* const invlist)
7862 {
7863     /* Returns the pointer to the inversion list's array.  Every time the
7864      * length changes, this needs to be called in case malloc or realloc moved
7865      * it */
7866
7867     PERL_ARGS_ASSERT_INVLIST_ARRAY;
7868
7869     /* Must not be empty.  If these fail, you probably didn't check for <len>
7870      * being non-zero before trying to get the array */
7871     assert(_invlist_len(invlist));
7872
7873     /* The very first element always contains zero, The array begins either
7874      * there, or if the inversion list is offset, at the element after it.
7875      * The offset header field determines which; it contains 0 or 1 to indicate
7876      * how much additionally to add */
7877     assert(0 == *(SvPVX(invlist)));
7878     return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
7879 }
7880
7881 PERL_STATIC_INLINE void
7882 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
7883 {
7884     /* Sets the current number of elements stored in the inversion list.
7885      * Updates SvCUR correspondingly */
7886
7887     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7888
7889     assert(SvTYPE(invlist) == SVt_INVLIST);
7890
7891     SvCUR_set(invlist,
7892               (len == 0)
7893                ? 0
7894                : TO_INTERNAL_SIZE(len + offset));
7895     assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
7896 }
7897
7898 PERL_STATIC_INLINE IV*
7899 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7900 {
7901     /* Return the address of the IV that is reserved to hold the cached index
7902      * */
7903
7904     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7905
7906     assert(SvTYPE(invlist) == SVt_INVLIST);
7907
7908     return &(((XINVLIST*) SvANY(invlist))->prev_index);
7909 }
7910
7911 PERL_STATIC_INLINE IV
7912 S_invlist_previous_index(pTHX_ SV* const invlist)
7913 {
7914     /* Returns cached index of previous search */
7915
7916     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7917
7918     return *get_invlist_previous_index_addr(invlist);
7919 }
7920
7921 PERL_STATIC_INLINE void
7922 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7923 {
7924     /* Caches <index> for later retrieval */
7925
7926     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7927
7928     assert(index == 0 || index < (int) _invlist_len(invlist));
7929
7930     *get_invlist_previous_index_addr(invlist) = index;
7931 }
7932
7933 PERL_STATIC_INLINE UV
7934 S_invlist_max(pTHX_ SV* const invlist)
7935 {
7936     /* Returns the maximum number of elements storable in the inversion list's
7937      * array, without having to realloc() */
7938
7939     PERL_ARGS_ASSERT_INVLIST_MAX;
7940
7941     assert(SvTYPE(invlist) == SVt_INVLIST);
7942
7943     /* Assumes worst case, in which the 0 element is not counted in the
7944      * inversion list, so subtracts 1 for that */
7945     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
7946            ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
7947            : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
7948 }
7949
7950 #ifndef PERL_IN_XSUB_RE
7951 SV*
7952 Perl__new_invlist(pTHX_ IV initial_size)
7953 {
7954
7955     /* Return a pointer to a newly constructed inversion list, with enough
7956      * space to store 'initial_size' elements.  If that number is negative, a
7957      * system default is used instead */
7958
7959     SV* new_list;
7960
7961     if (initial_size < 0) {
7962         initial_size = 10;
7963     }
7964
7965     /* Allocate the initial space */
7966     new_list = newSV_type(SVt_INVLIST);
7967
7968     /* First 1 is in case the zero element isn't in the list; second 1 is for
7969      * trailing NUL */
7970     SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
7971     invlist_set_len(new_list, 0, 0);
7972
7973     /* Force iterinit() to be used to get iteration to work */
7974     *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
7975
7976     *get_invlist_previous_index_addr(new_list) = 0;
7977
7978     return new_list;
7979 }
7980
7981 SV*
7982 Perl__new_invlist_C_array(pTHX_ const UV* const list)
7983 {
7984     /* Return a pointer to a newly constructed inversion list, initialized to
7985      * point to <list>, which has to be in the exact correct inversion list
7986      * form, including internal fields.  Thus this is a dangerous routine that
7987      * should not be used in the wrong hands.  The passed in 'list' contains
7988      * several header fields at the beginning that are not part of the
7989      * inversion list body proper */
7990
7991     const STRLEN length = (STRLEN) list[0];
7992     const UV version_id =          list[1];
7993     const bool offset   =    cBOOL(list[2]);
7994 #define HEADER_LENGTH 3
7995     /* If any of the above changes in any way, you must change HEADER_LENGTH
7996      * (if appropriate) and regenerate INVLIST_VERSION_ID by running
7997      *      perl -E 'say int(rand 2**31-1)'
7998      */
7999 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
8000                                         data structure type, so that one being
8001                                         passed in can be validated to be an
8002                                         inversion list of the correct vintage.
8003                                        */
8004
8005     SV* invlist = newSV_type(SVt_INVLIST);
8006
8007     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
8008
8009     if (version_id != INVLIST_VERSION_ID) {
8010         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
8011     }
8012
8013     /* The generated array passed in includes header elements that aren't part
8014      * of the list proper, so start it just after them */
8015     SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
8016
8017     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
8018                                shouldn't touch it */
8019
8020     *(get_invlist_offset_addr(invlist)) = offset;
8021
8022     /* The 'length' passed to us is the physical number of elements in the
8023      * inversion list.  But if there is an offset the logical number is one
8024      * less than that */
8025     invlist_set_len(invlist, length  - offset, offset);
8026
8027     invlist_set_previous_index(invlist, 0);
8028
8029     /* Initialize the iteration pointer. */
8030     invlist_iterfinish(invlist);
8031
8032     SvREADONLY_on(invlist);
8033
8034     return invlist;
8035 }
8036 #endif /* ifndef PERL_IN_XSUB_RE */
8037
8038 STATIC void
8039 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
8040 {
8041     /* Grow the maximum size of an inversion list */
8042
8043     PERL_ARGS_ASSERT_INVLIST_EXTEND;
8044
8045     assert(SvTYPE(invlist) == SVt_INVLIST);
8046
8047     /* Add one to account for the zero element at the beginning which may not
8048      * be counted by the calling parameters */
8049     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
8050 }
8051
8052 PERL_STATIC_INLINE void
8053 S_invlist_trim(pTHX_ SV* const invlist)
8054 {
8055     PERL_ARGS_ASSERT_INVLIST_TRIM;
8056
8057     assert(SvTYPE(invlist) == SVt_INVLIST);
8058
8059     /* Change the length of the inversion list to how many entries it currently
8060      * has */
8061     SvPV_shrink_to_cur((SV *) invlist);
8062 }
8063
8064 STATIC void
8065 S__append_range_to_invlist(pTHX_ SV* const invlist,
8066                                  const UV start, const UV end)
8067 {
8068    /* Subject to change or removal.  Append the range from 'start' to 'end' at
8069     * the end of the inversion list.  The range must be above any existing
8070     * ones. */
8071
8072     UV* array;
8073     UV max = invlist_max(invlist);
8074     UV len = _invlist_len(invlist);
8075     bool offset;
8076
8077     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8078
8079     if (len == 0) { /* Empty lists must be initialized */
8080         offset = start != 0;
8081         array = _invlist_array_init(invlist, ! offset);
8082     }
8083     else {
8084         /* Here, the existing list is non-empty. The current max entry in the
8085          * list is generally the first value not in the set, except when the
8086          * set extends to the end of permissible values, in which case it is
8087          * the first entry in that final set, and so this call is an attempt to
8088          * append out-of-order */
8089
8090         UV final_element = len - 1;
8091         array = invlist_array(invlist);
8092         if (array[final_element] > start
8093             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8094         {
8095             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8096                      array[final_element], start,
8097                      ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8098         }
8099
8100         /* Here, it is a legal append.  If the new range begins with the first
8101          * value not in the set, it is extending the set, so the new first
8102          * value not in the set is one greater than the newly extended range.
8103          * */
8104         offset = *get_invlist_offset_addr(invlist);
8105         if (array[final_element] == start) {
8106             if (end != UV_MAX) {
8107                 array[final_element] = end + 1;
8108             }
8109             else {
8110                 /* But if the end is the maximum representable on the machine,
8111                  * just let the range that this would extend to have no end */
8112                 invlist_set_len(invlist, len - 1, offset);
8113             }
8114             return;
8115         }
8116     }
8117
8118     /* Here the new range doesn't extend any existing set.  Add it */
8119
8120     len += 2;   /* Includes an element each for the start and end of range */
8121
8122     /* If wll overflow the existing space, extend, which may cause the array to
8123      * be moved */
8124     if (max < len) {
8125         invlist_extend(invlist, len);
8126
8127         /* Have to set len here to avoid assert failure in invlist_array() */
8128         invlist_set_len(invlist, len, offset);
8129
8130         array = invlist_array(invlist);
8131     }
8132     else {
8133         invlist_set_len(invlist, len, offset);
8134     }
8135
8136     /* The next item on the list starts the range, the one after that is
8137      * one past the new range.  */
8138     array[len - 2] = start;
8139     if (end != UV_MAX) {
8140         array[len - 1] = end + 1;
8141     }
8142     else {
8143         /* But if the end is the maximum representable on the machine, just let
8144          * the range have no end */
8145         invlist_set_len(invlist, len - 1, offset);
8146     }
8147 }
8148
8149 #ifndef PERL_IN_XSUB_RE
8150
8151 IV
8152 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
8153 {
8154     /* Searches the inversion list for the entry that contains the input code
8155      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
8156      * return value is the index into the list's array of the range that
8157      * contains <cp> */
8158
8159     IV low = 0;
8160     IV mid;
8161     IV high = _invlist_len(invlist);
8162     const IV highest_element = high - 1;
8163     const UV* array;
8164
8165     PERL_ARGS_ASSERT__INVLIST_SEARCH;
8166
8167     /* If list is empty, return failure. */
8168     if (high == 0) {
8169         return -1;
8170     }
8171
8172     /* (We can't get the array unless we know the list is non-empty) */
8173     array = invlist_array(invlist);
8174
8175     mid = invlist_previous_index(invlist);
8176     assert(mid >=0 && mid <= highest_element);
8177
8178     /* <mid> contains the cache of the result of the previous call to this
8179      * function (0 the first time).  See if this call is for the same result,
8180      * or if it is for mid-1.  This is under the theory that calls to this
8181      * function will often be for related code points that are near each other.
8182      * And benchmarks show that caching gives better results.  We also test
8183      * here if the code point is within the bounds of the list.  These tests
8184      * replace others that would have had to be made anyway to make sure that
8185      * the array bounds were not exceeded, and these give us extra information
8186      * at the same time */
8187     if (cp >= array[mid]) {
8188         if (cp >= array[highest_element]) {
8189             return highest_element;
8190         }
8191
8192         /* Here, array[mid] <= cp < array[highest_element].  This means that
8193          * the final element is not the answer, so can exclude it; it also
8194          * means that <mid> is not the final element, so can refer to 'mid + 1'
8195          * safely */
8196         if (cp < array[mid + 1]) {
8197             return mid;
8198         }
8199         high--;
8200         low = mid + 1;
8201     }
8202     else { /* cp < aray[mid] */
8203         if (cp < array[0]) { /* Fail if outside the array */
8204             return -1;
8205         }
8206         high = mid;
8207         if (cp >= array[mid - 1]) {
8208             goto found_entry;
8209         }
8210     }
8211
8212     /* Binary search.  What we are looking for is <i> such that
8213      *  array[i] <= cp < array[i+1]
8214      * The loop below converges on the i+1.  Note that there may not be an
8215      * (i+1)th element in the array, and things work nonetheless */
8216     while (low < high) {
8217         mid = (low + high) / 2;
8218         assert(mid <= highest_element);
8219         if (array[mid] <= cp) { /* cp >= array[mid] */
8220             low = mid + 1;
8221
8222             /* We could do this extra test to exit the loop early.
8223             if (cp < array[low]) {
8224                 return mid;
8225             }
8226             */
8227         }
8228         else { /* cp < array[mid] */
8229             high = mid;
8230         }
8231     }
8232
8233   found_entry:
8234     high--;
8235     invlist_set_previous_index(invlist, high);
8236     return high;
8237 }
8238
8239 void
8240 Perl__invlist_populate_swatch(pTHX_ SV* const invlist,
8241                                     const UV start, const UV end, U8* swatch)
8242 {
8243     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8244      * but is used when the swash has an inversion list.  This makes this much
8245      * faster, as it uses a binary search instead of a linear one.  This is
8246      * intimately tied to that function, and perhaps should be in utf8.c,
8247      * except it is intimately tied to inversion lists as well.  It assumes
8248      * that <swatch> is all 0's on input */
8249
8250     UV current = start;
8251     const IV len = _invlist_len(invlist);
8252     IV i;
8253     const UV * array;
8254
8255     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8256
8257     if (len == 0) { /* Empty inversion list */
8258         return;
8259     }
8260
8261     array = invlist_array(invlist);
8262
8263     /* Find which element it is */
8264     i = _invlist_search(invlist, start);
8265
8266     /* We populate from <start> to <end> */
8267     while (current < end) {
8268         UV upper;
8269
8270         /* The inversion list gives the results for every possible code point
8271          * after the first one in the list.  Only those ranges whose index is
8272          * even are ones that the inversion list matches.  For the odd ones,
8273          * and if the initial code point is not in the list, we have to skip
8274          * forward to the next element */
8275         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8276             i++;
8277             if (i >= len) { /* Finished if beyond the end of the array */
8278                 return;
8279             }
8280             current = array[i];
8281             if (current >= end) {   /* Finished if beyond the end of what we
8282                                        are populating */
8283                 if (LIKELY(end < UV_MAX)) {
8284                     return;
8285                 }
8286
8287                 /* We get here when the upper bound is the maximum
8288                  * representable on the machine, and we are looking for just
8289                  * that code point.  Have to special case it */
8290                 i = len;
8291                 goto join_end_of_list;
8292             }
8293         }
8294         assert(current >= start);
8295
8296         /* The current range ends one below the next one, except don't go past
8297          * <end> */
8298         i++;
8299         upper = (i < len && array[i] < end) ? array[i] : end;
8300
8301         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
8302          * for each code point in it */
8303         for (; current < upper; current++) {
8304             const STRLEN offset = (STRLEN)(current - start);
8305             swatch[offset >> 3] |= 1 << (offset & 7);
8306         }
8307
8308     join_end_of_list:
8309
8310         /* Quit if at the end of the list */
8311         if (i >= len) {
8312
8313             /* But first, have to deal with the highest possible code point on
8314              * the platform.  The previous code assumes that <end> is one
8315              * beyond where we want to populate, but that is impossible at the
8316              * platform's infinity, so have to handle it specially */
8317             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8318             {
8319                 const STRLEN offset = (STRLEN)(end - start);
8320                 swatch[offset >> 3] |= 1 << (offset & 7);
8321             }
8322             return;
8323         }
8324
8325         /* Advance to the next range, which will be for code points not in the
8326          * inversion list */
8327         current = array[i];
8328     }
8329
8330     return;
8331 }
8332
8333 void
8334 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8335                                          const bool complement_b, SV** output)
8336 {
8337     /* Take the union of two inversion lists and point <output> to it.  *output
8338      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8339      * the reference count to that list will be decremented if not already a
8340      * temporary (mortal); otherwise *output will be made correspondingly
8341      * mortal.  The first list, <a>, may be NULL, in which case a copy of the
8342      * second list is returned.  If <complement_b> is TRUE, the union is taken
8343      * of the complement (inversion) of <b> instead of b itself.
8344      *
8345      * The basis for this comes from "Unicode Demystified" Chapter 13 by
8346      * Richard Gillam, published by Addison-Wesley, and explained at some
8347      * length there.  The preface says to incorporate its examples into your
8348      * code at your own risk.
8349      *
8350      * The algorithm is like a merge sort.
8351      *
8352      * XXX A potential performance improvement is to keep track as we go along
8353      * if only one of the inputs contributes to the result, meaning the other
8354      * is a subset of that one.  In that case, we can skip the final copy and
8355      * return the larger of the input lists, but then outside code might need
8356      * to keep track of whether to free the input list or not */
8357
8358     const UV* array_a;    /* a's array */
8359     const UV* array_b;
8360     UV len_a;       /* length of a's array */
8361     UV len_b;
8362
8363     SV* u;                      /* the resulting union */
8364     UV* array_u;
8365     UV len_u;
8366
8367     UV i_a = 0;             /* current index into a's array */
8368     UV i_b = 0;
8369     UV i_u = 0;
8370
8371     /* running count, as explained in the algorithm source book; items are
8372      * stopped accumulating and are output when the count changes to/from 0.
8373      * The count is incremented when we start a range that's in the set, and
8374      * decremented when we start a range that's not in the set.  So its range
8375      * is 0 to 2.  Only when the count is zero is something not in the set.
8376      */
8377     UV count = 0;
8378
8379     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8380     assert(a != b);
8381
8382     /* If either one is empty, the union is the other one */
8383     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8384         bool make_temp = FALSE; /* Should we mortalize the result? */
8385
8386         if (*output == a) {
8387             if (a != NULL) {
8388                 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8389                     SvREFCNT_dec_NN(a);
8390                 }
8391             }
8392         }
8393         if (*output != b) {
8394             *output = invlist_clone(b);
8395             if (complement_b) {
8396                 _invlist_invert(*output);
8397             }
8398         } /* else *output already = b; */
8399
8400         if (make_temp) {
8401             sv_2mortal(*output);
8402         }
8403         return;
8404     }
8405     else if ((len_b = _invlist_len(b)) == 0) {
8406         bool make_temp = FALSE;
8407         if (*output == b) {
8408             if (! (make_temp = cBOOL(SvTEMP(b)))) {
8409                 SvREFCNT_dec_NN(b);
8410             }
8411         }
8412
8413         /* The complement of an empty list is a list that has everything in it,
8414          * so the union with <a> includes everything too */
8415         if (complement_b) {
8416             if (a == *output) {
8417                 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8418                     SvREFCNT_dec_NN(a);
8419                 }
8420             }
8421             *output = _new_invlist(1);
8422             _append_range_to_invlist(*output, 0, UV_MAX);
8423         }
8424         else if (*output != a) {
8425             *output = invlist_clone(a);
8426         }
8427         /* else *output already = a; */
8428
8429         if (make_temp) {
8430             sv_2mortal(*output);
8431         }
8432         return;
8433     }
8434
8435     /* Here both lists exist and are non-empty */
8436     array_a = invlist_array(a);
8437     array_b = invlist_array(b);
8438
8439     /* If are to take the union of 'a' with the complement of b, set it
8440      * up so are looking at b's complement. */
8441     if (complement_b) {
8442
8443         /* To complement, we invert: if the first element is 0, remove it.  To
8444          * do this, we just pretend the array starts one later */
8445         if (array_b[0] == 0) {
8446             array_b++;
8447             len_b--;
8448         }
8449         else {
8450
8451             /* But if the first element is not zero, we pretend the list starts
8452              * at the 0 that is always stored immediately before the array. */
8453             array_b--;
8454             len_b++;
8455         }
8456     }
8457
8458     /* Size the union for the worst case: that the sets are completely
8459      * disjoint */
8460     u = _new_invlist(len_a + len_b);
8461
8462     /* Will contain U+0000 if either component does */
8463     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8464                                       || (len_b > 0 && array_b[0] == 0));
8465
8466     /* Go through each list item by item, stopping when exhausted one of
8467      * them */
8468     while (i_a < len_a && i_b < len_b) {
8469         UV cp;      /* The element to potentially add to the union's array */
8470         bool cp_in_set;   /* is it in the the input list's set or not */
8471
8472         /* We need to take one or the other of the two inputs for the union.
8473          * Since we are merging two sorted lists, we take the smaller of the
8474          * next items.  In case of a tie, we take the one that is in its set
8475          * first.  If we took one not in the set first, it would decrement the
8476          * count, possibly to 0 which would cause it to be output as ending the
8477          * range, and the next time through we would take the same number, and
8478          * output it again as beginning the next range.  By doing it the
8479          * opposite way, there is no possibility that the count will be
8480          * momentarily decremented to 0, and thus the two adjoining ranges will
8481          * be seamlessly merged.  (In a tie and both are in the set or both not
8482          * in the set, it doesn't matter which we take first.) */
8483         if (array_a[i_a] < array_b[i_b]
8484             || (array_a[i_a] == array_b[i_b]
8485                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8486         {
8487             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8488             cp= array_a[i_a++];
8489         }
8490         else {
8491             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8492             cp = array_b[i_b++];
8493         }
8494
8495         /* Here, have chosen which of the two inputs to look at.  Only output
8496          * if the running count changes to/from 0, which marks the
8497          * beginning/end of a range in that's in the set */
8498         if (cp_in_set) {
8499             if (count == 0) {
8500                 array_u[i_u++] = cp;
8501             }
8502             count++;
8503         }
8504         else {
8505             count--;
8506             if (count == 0) {
8507                 array_u[i_u++] = cp;
8508             }
8509         }
8510     }
8511
8512     /* Here, we are finished going through at least one of the lists, which
8513      * means there is something remaining in at most one.  We check if the list
8514      * that hasn't been exhausted is positioned such that we are in the middle
8515      * of a range in its set or not.  (i_a and i_b point to the element beyond
8516      * the one we care about.) If in the set, we decrement 'count'; if 0, there
8517      * is potentially more to output.
8518      * There are four cases:
8519      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
8520      *     in the union is entirely from the non-exhausted set.
8521      *  2) Both were in their sets, count is 2.  Nothing further should
8522      *     be output, as everything that remains will be in the exhausted
8523      *     list's set, hence in the union; decrementing to 1 but not 0 insures
8524      *     that
8525      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
8526      *     Nothing further should be output because the union includes
8527      *     everything from the exhausted set.  Not decrementing ensures that.
8528      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8529      *     decrementing to 0 insures that we look at the remainder of the
8530      *     non-exhausted set */
8531     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8532         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8533     {
8534         count--;
8535     }
8536
8537     /* The final length is what we've output so far, plus what else is about to
8538      * be output.  (If 'count' is non-zero, then the input list we exhausted
8539      * has everything remaining up to the machine's limit in its set, and hence
8540      * in the union, so there will be no further output. */
8541     len_u = i_u;
8542     if (count == 0) {
8543         /* At most one of the subexpressions will be non-zero */
8544         len_u += (len_a - i_a) + (len_b - i_b);
8545     }
8546
8547     /* Set result to final length, which can change the pointer to array_u, so
8548      * re-find it */
8549     if (len_u != _invlist_len(u)) {
8550         invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8551         invlist_trim(u);
8552         array_u = invlist_array(u);
8553     }
8554
8555     /* When 'count' is 0, the list that was exhausted (if one was shorter than
8556      * the other) ended with everything above it not in its set.  That means
8557      * that the remaining part of the union is precisely the same as the
8558      * non-exhausted list, so can just copy it unchanged.  (If both list were
8559      * exhausted at the same time, then the operations below will be both 0.)
8560      */
8561     if (count == 0) {
8562         IV copy_count; /* At most one will have a non-zero copy count */
8563         if ((copy_count = len_a - i_a) > 0) {
8564             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8565         }
8566         else if ((copy_count = len_b - i_b) > 0) {
8567             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8568         }
8569     }
8570
8571     /*  We may be removing a reference to one of the inputs.  If so, the output
8572      *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8573      *  count decremented) */
8574     if (a == *output || b == *output) {
8575         assert(! invlist_is_iterating(*output));
8576         if ((SvTEMP(*output))) {
8577             sv_2mortal(u);
8578         }
8579         else {
8580             SvREFCNT_dec_NN(*output);
8581         }
8582     }
8583
8584     *output = u;
8585
8586     return;
8587 }
8588
8589 void
8590 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8591                                                const bool complement_b, SV** i)
8592 {
8593     /* Take the intersection of two inversion lists and point <i> to it.  *i
8594      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8595      * the reference count to that list will be decremented if not already a
8596      * temporary (mortal); otherwise *i will be made correspondingly mortal.
8597      * The first list, <a>, may be NULL, in which case an empty list is
8598      * returned.  If <complement_b> is TRUE, the result will be the
8599      * intersection of <a> and the complement (or inversion) of <b> instead of
8600      * <b> directly.
8601      *
8602      * The basis for this comes from "Unicode Demystified" Chapter 13 by
8603      * Richard Gillam, published by Addison-Wesley, and explained at some
8604      * length there.  The preface says to incorporate its examples into your
8605      * code at your own risk.  In fact, it had bugs
8606      *
8607      * The algorithm is like a merge sort, and is essentially the same as the
8608      * union above
8609      */
8610
8611     const UV* array_a;          /* a's array */
8612     const UV* array_b;
8613     UV len_a;   /* length of a's array */
8614     UV len_b;
8615
8616     SV* r;                   /* the resulting intersection */
8617     UV* array_r;
8618     UV len_r;
8619
8620     UV i_a = 0;             /* current index into a's array */
8621     UV i_b = 0;
8622     UV i_r = 0;
8623
8624     /* running count, as explained in the algorithm source book; items are
8625      * stopped accumulating and are output when the count changes to/from 2.
8626      * The count is incremented when we start a range that's in the set, and
8627      * decremented when we start a range that's not in the set.  So its range
8628      * is 0 to 2.  Only when the count is 2 is something in the intersection.
8629      */
8630     UV count = 0;
8631
8632     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8633     assert(a != b);
8634
8635     /* Special case if either one is empty */
8636     len_a = (a == NULL) ? 0 : _invlist_len(a);
8637     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8638         bool make_temp = FALSE;
8639
8640         if (len_a != 0 && complement_b) {
8641
8642             /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8643              * be empty.  Here, also we are using 'b's complement, which hence
8644              * must be every possible code point.  Thus the intersection is
8645              * simply 'a'. */
8646             if (*i != a) {
8647                 if (*i == b) {
8648                     if (! (make_temp = cBOOL(SvTEMP(b)))) {
8649                         SvREFCNT_dec_NN(b);
8650                     }
8651                 }
8652
8653                 *i = invlist_clone(a);
8654             }
8655             /* else *i is already 'a' */
8656
8657             if (make_temp) {
8658                 sv_2mortal(*i);
8659             }
8660             return;
8661         }
8662
8663         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
8664          * intersection must be empty */
8665         if (*i == a) {
8666             if (! (make_temp = cBOOL(SvTEMP(a)))) {
8667                 SvREFCNT_dec_NN(a);
8668             }
8669         }
8670         else if (*i == b) {
8671             if (! (make_temp = cBOOL(SvTEMP(b)))) {
8672                 SvREFCNT_dec_NN(b);
8673             }
8674         }
8675         *i = _new_invlist(0);
8676         if (make_temp) {
8677             sv_2mortal(*i);
8678         }
8679
8680         return;
8681     }
8682
8683     /* Here both lists exist and are non-empty */
8684     array_a = invlist_array(a);
8685     array_b = invlist_array(b);
8686
8687     /* If are to take the intersection of 'a' with the complement of b, set it
8688      * up so are looking at b's complement. */
8689     if (complement_b) {
8690
8691         /* To complement, we invert: if the first element is 0, remove it.  To
8692          * do this, we just pretend the array starts one later */
8693         if (array_b[0] == 0) {
8694             array_b++;
8695             len_b--;
8696         }
8697         else {
8698
8699             /* But if the first element is not zero, we pretend the list starts
8700              * at the 0 that is always stored immediately before the array. */
8701             array_b--;
8702             len_b++;
8703         }
8704     }
8705
8706     /* Size the intersection for the worst case: that the intersection ends up
8707      * fragmenting everything to be completely disjoint */
8708     r= _new_invlist(len_a + len_b);
8709
8710     /* Will contain U+0000 iff both components do */
8711     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8712                                      && len_b > 0 && array_b[0] == 0);
8713
8714     /* Go through each list item by item, stopping when exhausted one of
8715      * them */
8716     while (i_a < len_a && i_b < len_b) {
8717         UV cp;      /* The element to potentially add to the intersection's
8718                        array */
8719         bool cp_in_set; /* Is it in the input list's set or not */
8720
8721         /* We need to take one or the other of the two inputs for the
8722          * intersection.  Since we are merging two sorted lists, we take the
8723          * smaller of the next items.  In case of a tie, we take the one that
8724          * is not in its set first (a difference from the union algorithm).  If
8725          * we took one in the set first, it would increment the count, possibly
8726          * to 2 which would cause it to be output as starting a range in the
8727          * intersection, and the next time through we would take that same
8728          * number, and output it again as ending the set.  By doing it the
8729          * opposite of this, there is no possibility that the count will be
8730          * momentarily incremented to 2.  (In a tie and both are in the set or
8731          * both not in the set, it doesn't matter which we take first.) */
8732         if (array_a[i_a] < array_b[i_b]
8733             || (array_a[i_a] == array_b[i_b]
8734                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8735         {
8736             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8737             cp= array_a[i_a++];
8738         }
8739         else {
8740             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8741             cp= array_b[i_b++];
8742         }
8743
8744         /* Here, have chosen which of the two inputs to look at.  Only output
8745          * if the running count changes to/from 2, which marks the
8746          * beginning/end of a range that's in the intersection */
8747         if (cp_in_set) {
8748             count++;
8749             if (count == 2) {
8750                 array_r[i_r++] = cp;
8751             }
8752         }
8753         else {
8754             if (count == 2) {
8755                 array_r[i_r++] = cp;
8756             }
8757             count--;
8758         }
8759     }
8760
8761     /* Here, we are finished going through at least one of the lists, which
8762      * means there is something remaining in at most one.  We check if the list
8763      * that has been exhausted is positioned such that we are in the middle
8764      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
8765      * the ones we care about.)  There are four cases:
8766      *  1) Both weren't in their sets, count is 0, and remains 0.  There's
8767      *     nothing left in the intersection.
8768      *  2) Both were in their sets, count is 2 and perhaps is incremented to
8769      *     above 2.  What should be output is exactly that which is in the
8770      *     non-exhausted set, as everything it has is also in the intersection
8771      *     set, and everything it doesn't have can't be in the intersection
8772      *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8773      *     gets incremented to 2.  Like the previous case, the intersection is
8774      *     everything that remains in the non-exhausted set.
8775      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8776      *     remains 1.  And the intersection has nothing more. */
8777     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8778         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8779     {
8780         count++;
8781     }
8782
8783     /* The final length is what we've output so far plus what else is in the
8784      * intersection.  At most one of the subexpressions below will be non-zero
8785      * */
8786     len_r = i_r;
8787     if (count >= 2) {
8788         len_r += (len_a - i_a) + (len_b - i_b);
8789     }
8790
8791     /* Set result to final length, which can change the pointer to array_r, so
8792      * re-find it */
8793     if (len_r != _invlist_len(r)) {
8794         invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8795         invlist_trim(r);
8796         array_r = invlist_array(r);
8797     }
8798
8799     /* Finish outputting any remaining */
8800     if (count >= 2) { /* At most one will have a non-zero copy count */
8801         IV copy_count;
8802         if ((copy_count = len_a - i_a) > 0) {
8803             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8804         }
8805         else if ((copy_count = len_b - i_b) > 0) {
8806             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8807         }
8808     }
8809
8810     /*  We may be removing a reference to one of the inputs.  If so, the output
8811      *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8812      *  count decremented) */
8813     if (a == *i || b == *i) {
8814         assert(! invlist_is_iterating(*i));
8815         if (SvTEMP(*i)) {
8816             sv_2mortal(r);
8817         }
8818         else {
8819             SvREFCNT_dec_NN(*i);
8820         }
8821     }
8822
8823     *i = r;
8824
8825     return;
8826 }
8827
8828 SV*
8829 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8830 {
8831     /* Add the range from 'start' to 'end' inclusive to the inversion list's
8832      * set.  A pointer to the inversion list is returned.  This may actually be
8833      * a new list, in which case the passed in one has been destroyed.  The
8834      * passed in inversion list can be NULL, in which case a new one is created
8835      * with just the one range in it */
8836
8837     SV* range_invlist;
8838     UV len;
8839
8840     if (invlist == NULL) {
8841         invlist = _new_invlist(2);
8842         len = 0;
8843     }
8844     else {
8845         len = _invlist_len(invlist);
8846     }
8847
8848     /* If comes after the final entry actually in the list, can just append it
8849      * to the end, */
8850     if (len == 0
8851         || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8852             && start >= invlist_array(invlist)[len - 1]))
8853     {
8854         _append_range_to_invlist(invlist, start, end);
8855         return invlist;
8856     }
8857
8858     /* Here, can't just append things, create and return a new inversion list
8859      * which is the union of this range and the existing inversion list */
8860     range_invlist = _new_invlist(2);
8861     _append_range_to_invlist(range_invlist, start, end);
8862
8863     _invlist_union(invlist, range_invlist, &invlist);
8864
8865     /* The temporary can be freed */
8866     SvREFCNT_dec_NN(range_invlist);
8867
8868     return invlist;
8869 }
8870
8871 SV*
8872 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
8873                                  UV** other_elements_ptr)
8874 {
8875     /* Create and return an inversion list whose contents are to be populated
8876      * by the caller.  The caller gives the number of elements (in 'size') and
8877      * the very first element ('element0').  This function will set
8878      * '*other_elements_ptr' to an array of UVs, where the remaining elements
8879      * are to be placed.
8880      *
8881      * Obviously there is some trust involved that the caller will properly
8882      * fill in the other elements of the array.
8883      *
8884      * (The first element needs to be passed in, as the underlying code does
8885      * things differently depending on whether it is zero or non-zero) */
8886
8887     SV* invlist = _new_invlist(size);
8888     bool offset;
8889
8890     PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
8891
8892     _append_range_to_invlist(invlist, element0, element0);
8893     offset = *get_invlist_offset_addr(invlist);
8894
8895     invlist_set_len(invlist, size, offset);
8896     *other_elements_ptr = invlist_array(invlist) + 1;
8897     return invlist;
8898 }
8899
8900 #endif
8901
8902 PERL_STATIC_INLINE SV*
8903 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8904     return _add_range_to_invlist(invlist, cp, cp);
8905 }
8906
8907 #ifndef PERL_IN_XSUB_RE
8908 void
8909 Perl__invlist_invert(pTHX_ SV* const invlist)
8910 {
8911     /* Complement the input inversion list.  This adds a 0 if the list didn't
8912      * have a zero; removes it otherwise.  As described above, the data
8913      * structure is set up so that this is very efficient */
8914
8915     PERL_ARGS_ASSERT__INVLIST_INVERT;
8916
8917     assert(! invlist_is_iterating(invlist));
8918
8919     /* The inverse of matching nothing is matching everything */
8920     if (_invlist_len(invlist) == 0) {
8921         _append_range_to_invlist(invlist, 0, UV_MAX);
8922         return;
8923     }
8924
8925     *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
8926 }
8927
8928 #endif
8929
8930 PERL_STATIC_INLINE SV*
8931 S_invlist_clone(pTHX_ SV* const invlist)
8932 {
8933
8934     /* Return a new inversion list that is a copy of the input one, which is
8935      * unchanged.  The new list will not be mortal even if the old one was. */
8936
8937     /* Need to allocate extra space to accommodate Perl's addition of a
8938      * trailing NUL to SvPV's, since it thinks they are always strings */
8939     SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8940     STRLEN physical_length = SvCUR(invlist);
8941     bool offset = *(get_invlist_offset_addr(invlist));
8942
8943     PERL_ARGS_ASSERT_INVLIST_CLONE;
8944
8945     *(get_invlist_offset_addr(new_invlist)) = offset;
8946     invlist_set_len(new_invlist, _invlist_len(invlist), offset);
8947     Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
8948
8949     return new_invlist;
8950 }
8951
8952 PERL_STATIC_INLINE STRLEN*
8953 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8954 {
8955     /* Return the address of the UV that contains the current iteration
8956      * position */
8957
8958     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8959
8960     assert(SvTYPE(invlist) == SVt_INVLIST);
8961
8962     return &(((XINVLIST*) SvANY(invlist))->iterator);
8963 }
8964
8965 PERL_STATIC_INLINE void
8966 S_invlist_iterinit(pTHX_ SV* invlist)   /* Initialize iterator for invlist */
8967 {
8968     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8969
8970     *get_invlist_iter_addr(invlist) = 0;
8971 }
8972
8973 PERL_STATIC_INLINE void
8974 S_invlist_iterfinish(pTHX_ SV* invlist)
8975 {
8976     /* Terminate iterator for invlist.  This is to catch development errors.
8977      * Any iteration that is interrupted before completed should call this
8978      * function.  Functions that add code points anywhere else but to the end
8979      * of an inversion list assert that they are not in the middle of an
8980      * iteration.  If they were, the addition would make the iteration
8981      * problematical: if the iteration hadn't reached the place where things
8982      * were being added, it would be ok */
8983
8984     PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8985
8986     *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
8987 }
8988
8989 STATIC bool
8990 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8991 {
8992     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8993      * This call sets in <*start> and <*end>, the next range in <invlist>.
8994      * Returns <TRUE> if successful and the next call will return the next
8995      * range; <FALSE> if was already at the end of the list.  If the latter,
8996      * <*start> and <*end> are unchanged, and the next call to this function
8997      * will start over at the beginning of the list */
8998
8999     STRLEN* pos = get_invlist_iter_addr(invlist);
9000     UV len = _invlist_len(invlist);
9001     UV *array;
9002
9003     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
9004
9005     if (*pos >= len) {
9006         *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
9007         return FALSE;
9008     }
9009
9010     array = invlist_array(invlist);
9011
9012     *start = array[(*pos)++];
9013
9014     if (*pos >= len) {
9015         *end = UV_MAX;
9016     }
9017     else {
9018         *end = array[(*pos)++] - 1;
9019     }
9020
9021     return TRUE;
9022 }
9023
9024 PERL_STATIC_INLINE bool
9025 S_invlist_is_iterating(pTHX_ SV* const invlist)
9026 {
9027     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
9028
9029     return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
9030 }
9031
9032 PERL_STATIC_INLINE UV
9033 S_invlist_highest(pTHX_ SV* const invlist)
9034 {
9035     /* Returns the highest code point that matches an inversion list.  This API
9036      * has an ambiguity, as it returns 0 under either the highest is actually
9037      * 0, or if the list is empty.  If this distinction matters to you, check
9038      * for emptiness before calling this function */
9039
9040     UV len = _invlist_len(invlist);
9041     UV *array;
9042
9043     PERL_ARGS_ASSERT_INVLIST_HIGHEST;
9044
9045     if (len == 0) {
9046         return 0;
9047     }
9048
9049     array = invlist_array(invlist);
9050
9051     /* The last element in the array in the inversion list always starts a
9052      * range that goes to infinity.  That range may be for code points that are
9053      * matched in the inversion list, or it may be for ones that aren't
9054      * matched.  In the latter case, the highest code point in the set is one
9055      * less than the beginning of this range; otherwise it is the final element
9056      * of this range: infinity */
9057     return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
9058            ? UV_MAX
9059            : array[len - 1] - 1;
9060 }
9061
9062 #ifndef PERL_IN_XSUB_RE
9063 SV *
9064 Perl__invlist_contents(pTHX_ SV* const invlist)
9065 {
9066     /* Get the contents of an inversion list into a string SV so that they can
9067      * be printed out.  It uses the format traditionally done for debug tracing
9068      */
9069
9070     UV start, end;
9071     SV* output = newSVpvs("\n");
9072
9073     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
9074
9075     assert(! invlist_is_iterating(invlist));
9076
9077     invlist_iterinit(invlist);
9078     while (invlist_iternext(invlist, &start, &end)) {
9079         if (end == UV_MAX) {
9080             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
9081         }
9082         else if (end != start) {
9083             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
9084                     start,       end);
9085         }
9086         else {
9087             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
9088         }
9089     }
9090
9091     return output;
9092 }
9093 #endif
9094
9095 #ifndef PERL_IN_XSUB_RE
9096 void
9097 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
9098                          const char * const indent, SV* const invlist)
9099 {
9100     /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
9101      * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
9102      * the string 'indent'.  The output looks like this:
9103          [0] 0x000A .. 0x000D
9104          [2] 0x0085
9105          [4] 0x2028 .. 0x2029
9106          [6] 0x3104 .. INFINITY
9107      * This means that the first range of code points matched by the list are
9108      * 0xA through 0xD; the second range contains only the single code point
9109      * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
9110      * are used to define each range (except if the final range extends to
9111      * infinity, only a single element is needed).  The array index of the
9112      * first element for the corresponding range is given in brackets. */
9113
9114     UV start, end;
9115     STRLEN count = 0;
9116
9117     PERL_ARGS_ASSERT__INVLIST_DUMP;
9118
9119     if (invlist_is_iterating(invlist)) {
9120         Perl_dump_indent(aTHX_ level, file,
9121              "%sCan't dump inversion list because is in middle of iterating\n",
9122              indent);
9123         return;
9124     }
9125
9126     invlist_iterinit(invlist);
9127     while (invlist_iternext(invlist, &start, &end)) {
9128         if (end == UV_MAX) {
9129             Perl_dump_indent(aTHX_ level, file,
9130                                        "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9131                                    indent, (UV)count, start);
9132         }
9133         else if (end != start) {
9134             Perl_dump_indent(aTHX_ level, file,
9135                                     "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9136                                 indent, (UV)count, start,         end);
9137         }
9138         else {
9139             Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9140                                             indent, (UV)count, start);
9141         }
9142         count += 2;
9143     }
9144 }
9145
9146 void
9147 Perl__load_PL_utf8_foldclosures (pTHX)
9148 {
9149     assert(! PL_utf8_foldclosures);
9150
9151     /* If the folds haven't been read in, call a fold function
9152      * to force that */
9153     if (! PL_utf8_tofold) {
9154         U8 dummy[UTF8_MAXBYTES_CASE+1];
9155
9156         /* This string is just a short named one above \xff */
9157         to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
9158         assert(PL_utf8_tofold); /* Verify that worked */
9159     }
9160     PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
9161 }
9162 #endif
9163
9164 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
9165 bool
9166 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9167 {
9168     /* Return a boolean as to if the two passed in inversion lists are
9169      * identical.  The final argument, if TRUE, says to take the complement of
9170      * the second inversion list before doing the comparison */
9171
9172     const UV* array_a = invlist_array(a);
9173     const UV* array_b = invlist_array(b);
9174     UV len_a = _invlist_len(a);
9175     UV len_b = _invlist_len(b);
9176
9177     UV i = 0;               /* current index into the arrays */
9178     bool retval = TRUE;     /* Assume are identical until proven otherwise */
9179
9180     PERL_ARGS_ASSERT__INVLISTEQ;
9181
9182     /* If are to compare 'a' with the complement of b, set it
9183      * up so are looking at b's complement. */
9184     if (complement_b) {
9185
9186         /* The complement of nothing is everything, so <a> would have to have
9187          * just one element, starting at zero (ending at infinity) */
9188         if (len_b == 0) {
9189             return (len_a == 1 && array_a[0] == 0);
9190         }
9191         else if (array_b[0] == 0) {
9192
9193             /* Otherwise, to complement, we invert.  Here, the first element is
9194              * 0, just remove it.  To do this, we just pretend the array starts
9195              * one later */
9196
9197             array_b++;
9198             len_b--;
9199         }
9200         else {
9201
9202             /* But if the first element is not zero, we pretend the list starts
9203              * at the 0 that is always stored immediately before the array. */
9204             array_b--;
9205             len_b++;
9206         }
9207     }
9208
9209     /* Make sure that the lengths are the same, as well as the final element
9210      * before looping through the remainder.  (Thus we test the length, final,
9211      * and first elements right off the bat) */
9212     if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9213         retval = FALSE;
9214     }
9215     else for (i = 0; i < len_a - 1; i++) {
9216         if (array_a[i] != array_b[i]) {
9217             retval = FALSE;
9218             break;
9219         }
9220     }
9221
9222     return retval;
9223 }
9224 #endif
9225
9226 #undef HEADER_LENGTH
9227 #undef TO_INTERNAL_SIZE
9228 #undef FROM_INTERNAL_SIZE
9229 #undef INVLIST_VERSION_ID
9230
9231 /* End of inversion list object */
9232
9233 STATIC void
9234 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
9235 {
9236     /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
9237      * constructs, and updates RExC_flags with them.  On input, RExC_parse
9238      * should point to the first flag; it is updated on output to point to the
9239      * final ')' or ':'.  There needs to be at least one flag, or this will
9240      * abort */
9241
9242     /* for (?g), (?gc), and (?o) warnings; warning
9243        about (?c) will warn about (?g) -- japhy    */
9244
9245 #define WASTED_O  0x01
9246 #define WASTED_G  0x02
9247 #define WASTED_C  0x04
9248 #define WASTED_GC (WASTED_G|WASTED_C)
9249     I32 wastedflags = 0x00;
9250     U32 posflags = 0, negflags = 0;
9251     U32 *flagsp = &posflags;
9252     char has_charset_modifier = '\0';
9253     regex_charset cs;
9254     bool has_use_defaults = FALSE;
9255     const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
9256
9257     PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
9258
9259     /* '^' as an initial flag sets certain defaults */
9260     if (UCHARAT(RExC_parse) == '^') {
9261         RExC_parse++;
9262         has_use_defaults = TRUE;
9263         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9264         set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
9265                                         ? REGEX_UNICODE_CHARSET
9266                                         : REGEX_DEPENDS_CHARSET);
9267     }
9268
9269     cs = get_regex_charset(RExC_flags);
9270     if (cs == REGEX_DEPENDS_CHARSET
9271         && (RExC_utf8 || RExC_uni_semantics))
9272     {
9273         cs = REGEX_UNICODE_CHARSET;
9274     }
9275
9276     while (*RExC_parse) {
9277         /* && strchr("iogcmsx", *RExC_parse) */
9278         /* (?g), (?gc) and (?o) are useless here
9279            and must be globally applied -- japhy */
9280         switch (*RExC_parse) {
9281
9282             /* Code for the imsx flags */
9283             CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
9284
9285             case LOCALE_PAT_MOD:
9286                 if (has_charset_modifier) {
9287                     goto excess_modifier;
9288                 }
9289                 else if (flagsp == &negflags) {
9290                     goto neg_modifier;
9291                 }
9292                 cs = REGEX_LOCALE_CHARSET;
9293                 has_charset_modifier = LOCALE_PAT_MOD;
9294                 break;
9295             case UNICODE_PAT_MOD:
9296                 if (has_charset_modifier) {
9297                     goto excess_modifier;
9298                 }
9299                 else if (flagsp == &negflags) {
9300                     goto neg_modifier;
9301                 }
9302                 cs = REGEX_UNICODE_CHARSET;
9303                 has_charset_modifier = UNICODE_PAT_MOD;
9304                 break;
9305             case ASCII_RESTRICT_PAT_MOD:
9306                 if (flagsp == &negflags) {
9307                     goto neg_modifier;
9308                 }
9309                 if (has_charset_modifier) {
9310                     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9311                         goto excess_modifier;
9312                     }
9313                     /* Doubled modifier implies more restricted */
9314                     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9315                 }
9316                 else {
9317                     cs = REGEX_ASCII_RESTRICTED_CHARSET;
9318                 }
9319                 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9320                 break;
9321             case DEPENDS_PAT_MOD:
9322                 if (has_use_defaults) {
9323                     goto fail_modifiers;
9324                 }
9325                 else if (flagsp == &negflags) {
9326                     goto neg_modifier;
9327                 }
9328                 else if (has_charset_modifier) {
9329                     goto excess_modifier;
9330                 }
9331
9332                 /* The dual charset means unicode semantics if the
9333                  * pattern (or target, not known until runtime) are
9334                  * utf8, or something in the pattern indicates unicode
9335                  * semantics */
9336                 cs = (RExC_utf8 || RExC_uni_semantics)
9337                      ? REGEX_UNICODE_CHARSET
9338                      : REGEX_DEPENDS_CHARSET;
9339                 has_charset_modifier = DEPENDS_PAT_MOD;
9340                 break;
9341             excess_modifier:
9342                 RExC_parse++;
9343                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9344                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9345                 }
9346                 else if (has_charset_modifier == *(RExC_parse - 1)) {
9347                     vFAIL2("Regexp modifier \"%c\" may not appear twice",
9348                                         *(RExC_parse - 1));
9349                 }
9350                 else {
9351                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9352                 }
9353                 /*NOTREACHED*/
9354             neg_modifier:
9355                 RExC_parse++;
9356                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
9357                                     *(RExC_parse - 1));
9358                 /*NOTREACHED*/
9359             case ONCE_PAT_MOD: /* 'o' */
9360             case GLOBAL_PAT_MOD: /* 'g' */
9361                 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9362                     const I32 wflagbit = *RExC_parse == 'o'
9363                                          ? WASTED_O
9364                                          : WASTED_G;
9365                     if (! (wastedflags & wflagbit) ) {
9366                         wastedflags |= wflagbit;
9367                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9368                         vWARN5(
9369                             RExC_parse + 1,
9370                             "Useless (%s%c) - %suse /%c modifier",
9371                             flagsp == &negflags ? "?-" : "?",
9372                             *RExC_parse,
9373                             flagsp == &negflags ? "don't " : "",
9374                             *RExC_parse
9375                         );
9376                     }
9377                 }
9378                 break;
9379
9380             case CONTINUE_PAT_MOD: /* 'c' */
9381                 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9382                     if (! (wastedflags & WASTED_C) ) {
9383                         wastedflags |= WASTED_GC;
9384                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9385                         vWARN3(
9386                             RExC_parse + 1,
9387                             "Useless (%sc) - %suse /gc modifier",
9388                             flagsp == &negflags ? "?-" : "?",
9389                             flagsp == &negflags ? "don't " : ""
9390                         );
9391                     }
9392                 }
9393                 break;
9394             case KEEPCOPY_PAT_MOD: /* 'p' */
9395                 if (flagsp == &negflags) {
9396                     if (SIZE_ONLY)
9397                         ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9398                 } else {
9399                     *flagsp |= RXf_PMf_KEEPCOPY;
9400                 }
9401                 break;
9402             case '-':
9403                 /* A flag is a default iff it is following a minus, so
9404                  * if there is a minus, it means will be trying to
9405                  * re-specify a default which is an error */
9406                 if (has_use_defaults || flagsp == &negflags) {
9407                     goto fail_modifiers;
9408                 }
9409                 flagsp = &negflags;
9410                 wastedflags = 0;  /* reset so (?g-c) warns twice */
9411                 break;
9412             case ':':
9413             case ')':
9414                 RExC_flags |= posflags;
9415                 RExC_flags &= ~negflags;
9416                 set_regex_charset(&RExC_flags, cs);
9417                 if (RExC_flags & RXf_PMf_FOLD) {
9418                     RExC_contains_i = 1;
9419                 }
9420                 return;
9421                 /*NOTREACHED*/
9422             default:
9423             fail_modifiers:
9424                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9425                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9426                 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9427                       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9428                 /*NOTREACHED*/
9429         }
9430
9431         ++RExC_parse;
9432     }
9433 }
9434
9435 /*
9436  - reg - regular expression, i.e. main body or parenthesized thing
9437  *
9438  * Caller must absorb opening parenthesis.
9439  *
9440  * Combining parenthesis handling with the base level of regular expression
9441  * is a trifle forced, but the need to tie the tails of the branches to what
9442  * follows makes it hard to avoid.
9443  */
9444 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9445 #ifdef DEBUGGING
9446 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9447 #else
9448 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9449 #endif
9450
9451 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9452    flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
9453    needs to be restarted.
9454    Otherwise would only return NULL if regbranch() returns NULL, which
9455    cannot happen.  */
9456 STATIC regnode *
9457 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9458     /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9459      * 2 is like 1, but indicates that nextchar() has been called to advance
9460      * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
9461      * this flag alerts us to the need to check for that */
9462 {
9463     dVAR;
9464     regnode *ret;               /* Will be the head of the group. */
9465     regnode *br;
9466     regnode *lastbr;
9467     regnode *ender = NULL;
9468     I32 parno = 0;
9469     I32 flags;
9470     U32 oregflags = RExC_flags;
9471     bool have_branch = 0;
9472     bool is_open = 0;
9473     I32 freeze_paren = 0;
9474     I32 after_freeze = 0;
9475
9476     char * parse_start = RExC_parse; /* MJD */
9477     char * const oregcomp_parse = RExC_parse;
9478
9479     GET_RE_DEBUG_FLAGS_DECL;
9480
9481     PERL_ARGS_ASSERT_REG;
9482     DEBUG_PARSE("reg ");
9483
9484     *flagp = 0;                         /* Tentatively. */
9485
9486
9487     /* Make an OPEN node, if parenthesized. */
9488     if (paren) {
9489
9490         /* Under /x, space and comments can be gobbled up between the '(' and
9491          * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
9492          * intervening space, as the sequence is a token, and a token should be
9493          * indivisible */
9494         bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9495
9496         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9497             char *start_verb = RExC_parse;
9498             STRLEN verb_len = 0;
9499             char *start_arg = NULL;
9500             unsigned char op = 0;
9501             int argok = 1;
9502             int internal_argval = 0; /* internal_argval is only useful if
9503                                         !argok */
9504
9505             if (has_intervening_patws) {
9506                 RExC_parse++;
9507                 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
9508             }
9509             while ( *RExC_parse && *RExC_parse != ')' ) {
9510                 if ( *RExC_parse == ':' ) {
9511                     start_arg = RExC_parse + 1;
9512                     break;
9513                 }
9514                 RExC_parse++;
9515             }
9516             ++start_verb;
9517             verb_len = RExC_parse - start_verb;
9518             if ( start_arg ) {
9519                 RExC_parse++;
9520                 while ( *RExC_parse && *RExC_parse != ')' )
9521                     RExC_parse++;
9522                 if ( *RExC_parse != ')' )
9523                     vFAIL("Unterminated verb pattern argument");
9524                 if ( RExC_parse == start_arg )
9525                     start_arg = NULL;
9526             } else {
9527                 if ( *RExC_parse != ')' )
9528                     vFAIL("Unterminated verb pattern");
9529             }
9530
9531             switch ( *start_verb ) {
9532             case 'A':  /* (*ACCEPT) */
9533                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9534                     op = ACCEPT;
9535                     internal_argval = RExC_nestroot;
9536                 }
9537                 break;
9538             case 'C':  /* (*COMMIT) */
9539                 if ( memEQs(start_verb,verb_len,"COMMIT") )
9540                     op = COMMIT;
9541                 break;
9542             case 'F':  /* (*FAIL) */
9543                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9544                     op = OPFAIL;
9545                     argok = 0;
9546                 }
9547                 break;
9548             case ':':  /* (*:NAME) */
9549             case 'M':  /* (*MARK:NAME) */
9550                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9551                     op = MARKPOINT;
9552                     argok = -1;
9553                 }
9554                 break;
9555             case 'P':  /* (*PRUNE) */
9556                 if ( memEQs(start_verb,verb_len,"PRUNE") )
9557                     op = PRUNE;
9558                 break;
9559             case 'S':   /* (*SKIP) */
9560                 if ( memEQs(start_verb,verb_len,"SKIP") )
9561                     op = SKIP;
9562                 break;
9563             case 'T':  /* (*THEN) */
9564                 /* [19:06] <TimToady> :: is then */
9565                 if ( memEQs(start_verb,verb_len,"THEN") ) {
9566                     op = CUTGROUP;
9567                     RExC_seen |= REG_CUTGROUP_SEEN;
9568                 }
9569                 break;
9570             }
9571             if ( ! op ) {
9572                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9573                 vFAIL2utf8f(
9574                     "Unknown verb pattern '%"UTF8f"'",
9575                     UTF8fARG(UTF, verb_len, start_verb));
9576             }
9577             if ( argok ) {
9578                 if ( start_arg && internal_argval ) {
9579                     vFAIL3("Verb pattern '%.*s' may not have an argument",
9580                         verb_len, start_verb);
9581                 } else if ( argok < 0 && !start_arg ) {
9582                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9583                         verb_len, start_verb);
9584                 } else {
9585                     ret = reganode(pRExC_state, op, internal_argval);
9586                     if ( ! internal_argval && ! SIZE_ONLY ) {
9587                         if (start_arg) {
9588                             SV *sv = newSVpvn( start_arg,
9589                                                RExC_parse - start_arg);
9590                             ARG(ret) = add_data( pRExC_state,
9591                                                  STR_WITH_LEN("S"));
9592                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9593                             ret->flags = 0;
9594                         } else {
9595                             ret->flags = 1;
9596                         }
9597                     }
9598                 }
9599                 if (!internal_argval)
9600                     RExC_seen |= REG_VERBARG_SEEN;
9601             } else if ( start_arg ) {
9602                 vFAIL3("Verb pattern '%.*s' may not have an argument",
9603                         verb_len, start_verb);
9604             } else {
9605                 ret = reg_node(pRExC_state, op);
9606             }
9607             nextchar(pRExC_state);
9608             return ret;
9609         }
9610         else if (*RExC_parse == '?') { /* (?...) */
9611             bool is_logical = 0;
9612             const char * const seqstart = RExC_parse;
9613             if (has_intervening_patws) {
9614                 RExC_parse++;
9615                 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
9616             }
9617
9618             RExC_parse++;
9619             paren = *RExC_parse++;
9620             ret = NULL;                 /* For look-ahead/behind. */
9621             switch (paren) {
9622
9623             case 'P':   /* (?P...) variants for those used to PCRE/Python */
9624                 paren = *RExC_parse++;
9625                 if ( paren == '<')         /* (?P<...>) named capture */
9626                     goto named_capture;
9627                 else if (paren == '>') {   /* (?P>name) named recursion */
9628                     goto named_recursion;
9629                 }
9630                 else if (paren == '=') {   /* (?P=...)  named backref */
9631                     /* this pretty much dupes the code for \k<NAME> in
9632                      * regatom(), if you change this make sure you change that
9633                      * */
9634                     char* name_start = RExC_parse;
9635                     U32 num = 0;
9636                     SV *sv_dat = reg_scan_name(pRExC_state,
9637                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9638                     if (RExC_parse == name_start || *RExC_parse != ')')
9639                         /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9640                         vFAIL2("Sequence %.3s... not terminated",parse_start);
9641
9642                     if (!SIZE_ONLY) {
9643                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
9644                         RExC_rxi->data->data[num]=(void*)sv_dat;
9645                         SvREFCNT_inc_simple_void(sv_dat);
9646                     }
9647                     RExC_sawback = 1;
9648                     ret = reganode(pRExC_state,
9649                                    ((! FOLD)
9650                                      ? NREF
9651                                      : (ASCII_FOLD_RESTRICTED)
9652                                        ? NREFFA
9653                                        : (AT_LEAST_UNI_SEMANTICS)
9654                                          ? NREFFU
9655                                          : (LOC)
9656                                            ? NREFFL
9657                                            : NREFF),
9658                                     num);
9659                     *flagp |= HASWIDTH;
9660
9661                     Set_Node_Offset(ret, parse_start+1);
9662                     Set_Node_Cur_Length(ret, parse_start);
9663
9664                     nextchar(pRExC_state);
9665                     return ret;
9666                 }
9667                 RExC_parse++;
9668                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9669                 vFAIL3("Sequence (%.*s...) not recognized",
9670                                 RExC_parse-seqstart, seqstart);
9671                 /*NOTREACHED*/
9672             case '<':           /* (?<...) */
9673                 if (*RExC_parse == '!')
9674                     paren = ',';
9675                 else if (*RExC_parse != '=')
9676               named_capture:
9677                 {               /* (?<...>) */
9678                     char *name_start;
9679                     SV *svname;
9680                     paren= '>';
9681             case '\'':          /* (?'...') */
9682                     name_start= RExC_parse;
9683                     svname = reg_scan_name(pRExC_state,
9684                         SIZE_ONLY    /* reverse test from the others */
9685                         ? REG_RSN_RETURN_NAME
9686                         : REG_RSN_RETURN_NULL);
9687                     if (RExC_parse == name_start || *RExC_parse != paren)
9688                         vFAIL2("Sequence (?%c... not terminated",
9689                             paren=='>' ? '<' : paren);
9690                     if (SIZE_ONLY) {
9691                         HE *he_str;
9692                         SV *sv_dat = NULL;
9693                         if (!svname) /* shouldn't happen */
9694                             Perl_croak(aTHX_
9695                                 "panic: reg_scan_name returned NULL");
9696                         if (!RExC_paren_names) {
9697                             RExC_paren_names= newHV();
9698                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
9699 #ifdef DEBUGGING
9700                             RExC_paren_name_list= newAV();
9701                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
9702 #endif
9703                         }
9704                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
9705                         if ( he_str )
9706                             sv_dat = HeVAL(he_str);
9707                         if ( ! sv_dat ) {
9708                             /* croak baby croak */
9709                             Perl_croak(aTHX_
9710                                 "panic: paren_name hash element allocation failed");
9711                         } else if ( SvPOK(sv_dat) ) {
9712                             /* (?|...) can mean we have dupes so scan to check
9713                                its already been stored. Maybe a flag indicating
9714                                we are inside such a construct would be useful,
9715                                but the arrays are likely to be quite small, so
9716                                for now we punt -- dmq */
9717                             IV count = SvIV(sv_dat);
9718                             I32 *pv = (I32*)SvPVX(sv_dat);
9719                             IV i;
9720                             for ( i = 0 ; i < count ; i++ ) {
9721                                 if ( pv[i] == RExC_npar ) {
9722                                     count = 0;
9723                                     break;
9724                                 }
9725                             }
9726                             if ( count ) {
9727                                 pv = (I32*)SvGROW(sv_dat,
9728                                                 SvCUR(sv_dat) + sizeof(I32)+1);
9729                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
9730                                 pv[count] = RExC_npar;
9731                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
9732                             }
9733                         } else {
9734                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
9735                             sv_setpvn(sv_dat, (char *)&(RExC_npar),
9736                                                                 sizeof(I32));
9737                             SvIOK_on(sv_dat);
9738                             SvIV_set(sv_dat, 1);
9739                         }
9740 #ifdef DEBUGGING
9741                         /* Yes this does cause a memory leak in debugging Perls
9742                          * */
9743                         if (!av_store(RExC_paren_name_list,
9744                                       RExC_npar, SvREFCNT_inc(svname)))
9745                             SvREFCNT_dec_NN(svname);
9746 #endif
9747
9748                         /*sv_dump(sv_dat);*/
9749                     }
9750                     nextchar(pRExC_state);
9751                     paren = 1;
9752                     goto capturing_parens;
9753                 }
9754                 RExC_seen |= REG_LOOKBEHIND_SEEN;
9755                 RExC_in_lookbehind++;
9756                 RExC_parse++;
9757                 /* FALLTHROUGH */
9758             case '=':           /* (?=...) */
9759                 RExC_seen_zerolen++;
9760                 break;
9761             case '!':           /* (?!...) */
9762                 RExC_seen_zerolen++;
9763                 if (*RExC_parse == ')') {
9764                     ret=reg_node(pRExC_state, OPFAIL);
9765                     nextchar(pRExC_state);
9766                     return ret;
9767                 }
9768                 break;
9769             case '|':           /* (?|...) */
9770                 /* branch reset, behave like a (?:...) except that
9771                    buffers in alternations share the same numbers */
9772                 paren = ':';
9773                 after_freeze = freeze_paren = RExC_npar;
9774                 break;
9775             case ':':           /* (?:...) */
9776             case '>':           /* (?>...) */
9777                 break;
9778             case '$':           /* (?$...) */
9779             case '@':           /* (?@...) */
9780                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
9781                 break;
9782             case '0' :           /* (?0) */
9783             case 'R' :           /* (?R) */
9784                 if (*RExC_parse != ')')
9785                     FAIL("Sequence (?R) not terminated");
9786                 ret = reg_node(pRExC_state, GOSTART);
9787                     RExC_seen |= REG_GOSTART_SEEN;
9788                 *flagp |= POSTPONED;
9789                 nextchar(pRExC_state);
9790                 return ret;
9791                 /*notreached*/
9792             { /* named and numeric backreferences */
9793                 I32 num;
9794             case '&':            /* (?&NAME) */
9795                 parse_start = RExC_parse - 1;
9796               named_recursion:
9797                 {
9798                     SV *sv_dat = reg_scan_name(pRExC_state,
9799                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9800                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9801                 }
9802                 if (RExC_parse == RExC_end || *RExC_parse != ')')
9803                     vFAIL("Sequence (?&... not terminated");
9804                 goto gen_recurse_regop;
9805                 assert(0); /* NOT REACHED */
9806             case '+':
9807                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9808                     RExC_parse++;
9809                     vFAIL("Illegal pattern");
9810                 }
9811                 goto parse_recursion;
9812                 /* NOT REACHED*/
9813             case '-': /* (?-1) */
9814                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9815                     RExC_parse--; /* rewind to let it be handled later */
9816                     goto parse_flags;
9817                 }
9818                 /* FALLTHROUGH */
9819             case '1': case '2': case '3': case '4': /* (?1) */
9820             case '5': case '6': case '7': case '8': case '9':
9821                 RExC_parse--;
9822               parse_recursion:
9823                 num = atoi(RExC_parse);
9824                 parse_start = RExC_parse - 1; /* MJD */
9825                 if (*RExC_parse == '-')
9826                     RExC_parse++;
9827                 while (isDIGIT(*RExC_parse))
9828                         RExC_parse++;
9829                 if (*RExC_parse!=')')
9830                     vFAIL("Expecting close bracket");
9831
9832               gen_recurse_regop:
9833                 if ( paren == '-' ) {
9834                     /*
9835                     Diagram of capture buffer numbering.
9836                     Top line is the normal capture buffer numbers
9837                     Bottom line is the negative indexing as from
9838                     the X (the (?-2))
9839
9840                     +   1 2    3 4 5 X          6 7
9841                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
9842                     -   5 4    3 2 1 X          x x
9843
9844                     */
9845                     num = RExC_npar + num;
9846                     if (num < 1)  {
9847                         RExC_parse++;
9848                         vFAIL("Reference to nonexistent group");
9849                     }
9850                 } else if ( paren == '+' ) {
9851                     num = RExC_npar + num - 1;
9852                 }
9853
9854                 ret = reganode(pRExC_state, GOSUB, num);
9855                 if (!SIZE_ONLY) {
9856                     if (num > (I32)RExC_rx->nparens) {
9857                         RExC_parse++;
9858                         vFAIL("Reference to nonexistent group");
9859                     }
9860                     ARG2L_SET( ret, RExC_recurse_count++);
9861                     RExC_emit++;
9862                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9863                         "Recurse #%"UVuf" to %"IVdf"\n",
9864                               (UV)ARG(ret), (IV)ARG2L(ret)));
9865                 } else {
9866                     RExC_size++;
9867                 }
9868                     RExC_seen |= REG_RECURSE_SEEN;
9869                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
9870                 Set_Node_Offset(ret, parse_start); /* MJD */
9871
9872                 *flagp |= POSTPONED;
9873                 nextchar(pRExC_state);
9874                 return ret;
9875             } /* named and numeric backreferences */
9876             assert(0); /* NOT REACHED */
9877
9878             case '?':           /* (??...) */
9879                 is_logical = 1;
9880                 if (*RExC_parse != '{') {
9881                     RExC_parse++;
9882                     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9883                     vFAIL2utf8f(
9884                         "Sequence (%"UTF8f"...) not recognized",
9885                         UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9886                     /*NOTREACHED*/
9887                 }
9888                 *flagp |= POSTPONED;
9889                 paren = *RExC_parse++;
9890                 /* FALLTHROUGH */
9891             case '{':           /* (?{...}) */
9892             {
9893                 U32 n = 0;
9894                 struct reg_code_block *cb;
9895
9896                 RExC_seen_zerolen++;
9897
9898                 if (   !pRExC_state->num_code_blocks
9899                     || pRExC_state->code_index >= pRExC_state->num_code_blocks
9900                     || pRExC_state->code_blocks[pRExC_state->code_index].start
9901                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
9902                             - RExC_start)
9903                 ) {
9904                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
9905                         FAIL("panic: Sequence (?{...}): no code block found\n");
9906                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
9907                 }
9908                 /* this is a pre-compiled code block (?{...}) */
9909                 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
9910                 RExC_parse = RExC_start + cb->end;
9911                 if (!SIZE_ONLY) {
9912                     OP *o = cb->block;
9913                     if (cb->src_regex) {
9914                         n = add_data(pRExC_state, STR_WITH_LEN("rl"));
9915                         RExC_rxi->data->data[n] =
9916                             (void*)SvREFCNT_inc((SV*)cb->src_regex);
9917                         RExC_rxi->data->data[n+1] = (void*)o;
9918                     }
9919                     else {
9920                         n = add_data(pRExC_state,
9921                                (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
9922                         RExC_rxi->data->data[n] = (void*)o;
9923                     }
9924                 }
9925                 pRExC_state->code_index++;
9926                 nextchar(pRExC_state);
9927
9928                 if (is_logical) {
9929                     regnode *eval;
9930                     ret = reg_node(pRExC_state, LOGICAL);
9931                     eval = reganode(pRExC_state, EVAL, n);
9932                     if (!SIZE_ONLY) {
9933                         ret->flags = 2;
9934                         /* for later propagation into (??{}) return value */
9935                         eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
9936                     }
9937                     REGTAIL(pRExC_state, ret, eval);
9938                     /* deal with the length of this later - MJD */
9939                     return ret;
9940                 }
9941                 ret = reganode(pRExC_state, EVAL, n);
9942                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
9943                 Set_Node_Offset(ret, parse_start);
9944                 return ret;
9945             }
9946             case '(':           /* (?(?{...})...) and (?(?=...)...) */
9947             {
9948                 int is_define= 0;
9949                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
9950                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9951                         || RExC_parse[1] == '<'
9952                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
9953                         I32 flag;
9954                         regnode *tail;
9955
9956                         ret = reg_node(pRExC_state, LOGICAL);
9957                         if (!SIZE_ONLY)
9958                             ret->flags = 1;
9959
9960                         tail = reg(pRExC_state, 1, &flag, depth+1);
9961                         if (flag & RESTART_UTF8) {
9962                             *flagp = RESTART_UTF8;
9963                             return NULL;
9964                         }
9965                         REGTAIL(pRExC_state, ret, tail);
9966                         goto insert_if;
9967                     }
9968                 }
9969                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
9970                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9971                 {
9972                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
9973                     char *name_start= RExC_parse++;
9974                     U32 num = 0;
9975                     SV *sv_dat=reg_scan_name(pRExC_state,
9976                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9977                     if (RExC_parse == name_start || *RExC_parse != ch)
9978                         vFAIL2("Sequence (?(%c... not terminated",
9979                             (ch == '>' ? '<' : ch));
9980                     RExC_parse++;
9981                     if (!SIZE_ONLY) {
9982                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
9983                         RExC_rxi->data->data[num]=(void*)sv_dat;
9984                         SvREFCNT_inc_simple_void(sv_dat);
9985                     }
9986                     ret = reganode(pRExC_state,NGROUPP,num);
9987                     goto insert_if_check_paren;
9988                 }
9989                 else if (RExC_parse[0] == 'D' &&
9990                          RExC_parse[1] == 'E' &&
9991                          RExC_parse[2] == 'F' &&
9992                          RExC_parse[3] == 'I' &&
9993                          RExC_parse[4] == 'N' &&
9994                          RExC_parse[5] == 'E')
9995                 {
9996                     ret = reganode(pRExC_state,DEFINEP,0);
9997                     RExC_parse +=6 ;
9998                     is_define = 1;
9999                     goto insert_if_check_paren;
10000                 }
10001                 else if (RExC_parse[0] == 'R') {
10002                     RExC_parse++;
10003                     parno = 0;
10004                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10005                         parno = atoi(RExC_parse++);
10006                         while (isDIGIT(*RExC_parse))
10007                             RExC_parse++;
10008                     } else if (RExC_parse[0] == '&') {
10009                         SV *sv_dat;
10010                         RExC_parse++;
10011                         sv_dat = reg_scan_name(pRExC_state,
10012                             SIZE_ONLY
10013                             ? REG_RSN_RETURN_NULL
10014                             : REG_RSN_RETURN_DATA);
10015                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10016                     }
10017                     ret = reganode(pRExC_state,INSUBP,parno);
10018                     goto insert_if_check_paren;
10019                 }
10020                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10021                     /* (?(1)...) */
10022                     char c;
10023                     char *tmp;
10024                     parno = atoi(RExC_parse++);
10025
10026                     while (isDIGIT(*RExC_parse))
10027                         RExC_parse++;
10028                     ret = reganode(pRExC_state, GROUPP, parno);
10029
10030                  insert_if_check_paren:
10031                     if (*(tmp = nextchar(pRExC_state)) != ')') {
10032                         /* nextchar also skips comments, so undo its work
10033                          * and skip over the the next character.
10034                          */
10035                         RExC_parse = tmp;
10036                         RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10037                         vFAIL("Switch condition not recognized");
10038                     }
10039                   insert_if:
10040                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
10041                     br = regbranch(pRExC_state, &flags, 1,depth+1);
10042                     if (br == NULL) {
10043                         if (flags & RESTART_UTF8) {
10044                             *flagp = RESTART_UTF8;
10045                             return NULL;
10046                         }
10047                         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10048                               (UV) flags);
10049                     } else
10050                         REGTAIL(pRExC_state, br, reganode(pRExC_state,
10051                                                           LONGJMP, 0));
10052                     c = *nextchar(pRExC_state);
10053                     if (flags&HASWIDTH)
10054                         *flagp |= HASWIDTH;
10055                     if (c == '|') {
10056                         if (is_define)
10057                             vFAIL("(?(DEFINE)....) does not allow branches");
10058
10059                         /* Fake one for optimizer.  */
10060                         lastbr = reganode(pRExC_state, IFTHEN, 0);
10061
10062                         if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
10063                             if (flags & RESTART_UTF8) {
10064                                 *flagp = RESTART_UTF8;
10065                                 return NULL;
10066                             }
10067                             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10068                                   (UV) flags);
10069                         }
10070                         REGTAIL(pRExC_state, ret, lastbr);
10071                         if (flags&HASWIDTH)
10072                             *flagp |= HASWIDTH;
10073                         c = *nextchar(pRExC_state);
10074                     }
10075                     else
10076                         lastbr = NULL;
10077                     if (c != ')')
10078                         vFAIL("Switch (?(condition)... contains too many branches");
10079                     ender = reg_node(pRExC_state, TAIL);
10080                     REGTAIL(pRExC_state, br, ender);
10081                     if (lastbr) {
10082                         REGTAIL(pRExC_state, lastbr, ender);
10083                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10084                     }
10085                     else
10086                         REGTAIL(pRExC_state, ret, ender);
10087                     RExC_size++; /* XXX WHY do we need this?!!
10088                                     For large programs it seems to be required
10089                                     but I can't figure out why. -- dmq*/
10090                     return ret;
10091                 }
10092                 else {
10093                     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10094                     vFAIL("Unknown switch condition (?(...))");
10095                 }
10096             }
10097             case '[':           /* (?[ ... ]) */
10098                 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
10099                                          oregcomp_parse);
10100             case 0:
10101                 RExC_parse--; /* for vFAIL to print correctly */
10102                 vFAIL("Sequence (? incomplete");
10103                 break;
10104             default: /* e.g., (?i) */
10105                 --RExC_parse;
10106               parse_flags:
10107                 parse_lparen_question_flags(pRExC_state);
10108                 if (UCHARAT(RExC_parse) != ':') {
10109                     nextchar(pRExC_state);
10110                     *flagp = TRYAGAIN;
10111                     return NULL;
10112                 }
10113                 paren = ':';
10114                 nextchar(pRExC_state);
10115                 ret = NULL;
10116                 goto parse_rest;
10117             } /* end switch */
10118         }
10119         else {                  /* (...) */
10120           capturing_parens:
10121             parno = RExC_npar;
10122             RExC_npar++;
10123
10124             ret = reganode(pRExC_state, OPEN, parno);
10125             if (!SIZE_ONLY ){
10126                 if (!RExC_nestroot)
10127                     RExC_nestroot = parno;
10128                 if (RExC_seen & REG_RECURSE_SEEN
10129                     && !RExC_open_parens[parno-1])
10130                 {
10131                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10132                         "Setting open paren #%"IVdf" to %d\n",
10133                         (IV)parno, REG_NODE_NUM(ret)));
10134                     RExC_open_parens[parno-1]= ret;
10135                 }
10136             }
10137             Set_Node_Length(ret, 1); /* MJD */
10138             Set_Node_Offset(ret, RExC_parse); /* MJD */
10139             is_open = 1;
10140         }
10141     }
10142     else                        /* ! paren */
10143         ret = NULL;
10144
10145    parse_rest:
10146     /* Pick up the branches, linking them together. */
10147     parse_start = RExC_parse;   /* MJD */
10148     br = regbranch(pRExC_state, &flags, 1,depth+1);
10149
10150     /*     branch_len = (paren != 0); */
10151
10152     if (br == NULL) {
10153         if (flags & RESTART_UTF8) {
10154             *flagp = RESTART_UTF8;
10155             return NULL;
10156         }
10157         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10158     }
10159     if (*RExC_parse == '|') {
10160         if (!SIZE_ONLY && RExC_extralen) {
10161             reginsert(pRExC_state, BRANCHJ, br, depth+1);
10162         }
10163         else {                  /* MJD */
10164             reginsert(pRExC_state, BRANCH, br, depth+1);
10165             Set_Node_Length(br, paren != 0);
10166             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
10167         }
10168         have_branch = 1;
10169         if (SIZE_ONLY)
10170             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
10171     }
10172     else if (paren == ':') {
10173         *flagp |= flags&SIMPLE;
10174     }
10175     if (is_open) {                              /* Starts with OPEN. */
10176         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
10177     }
10178     else if (paren != '?')              /* Not Conditional */
10179         ret = br;
10180     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10181     lastbr = br;
10182     while (*RExC_parse == '|') {
10183         if (!SIZE_ONLY && RExC_extralen) {
10184             ender = reganode(pRExC_state, LONGJMP,0);
10185
10186             /* Append to the previous. */
10187             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10188         }
10189         if (SIZE_ONLY)
10190             RExC_extralen += 2;         /* Account for LONGJMP. */
10191         nextchar(pRExC_state);
10192         if (freeze_paren) {
10193             if (RExC_npar > after_freeze)
10194                 after_freeze = RExC_npar;
10195             RExC_npar = freeze_paren;
10196         }
10197         br = regbranch(pRExC_state, &flags, 0, depth+1);
10198
10199         if (br == NULL) {
10200             if (flags & RESTART_UTF8) {
10201                 *flagp = RESTART_UTF8;
10202                 return NULL;
10203             }
10204             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10205         }
10206         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
10207         lastbr = br;
10208         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10209     }
10210
10211     if (have_branch || paren != ':') {
10212         /* Make a closing node, and hook it on the end. */
10213         switch (paren) {
10214         case ':':
10215             ender = reg_node(pRExC_state, TAIL);
10216             break;
10217         case 1: case 2:
10218             ender = reganode(pRExC_state, CLOSE, parno);
10219             if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
10220                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10221                         "Setting close paren #%"IVdf" to %d\n",
10222                         (IV)parno, REG_NODE_NUM(ender)));
10223                 RExC_close_parens[parno-1]= ender;
10224                 if (RExC_nestroot == parno)
10225                     RExC_nestroot = 0;
10226             }
10227             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
10228             Set_Node_Length(ender,1); /* MJD */
10229             break;
10230         case '<':
10231         case ',':
10232         case '=':
10233         case '!':
10234             *flagp &= ~HASWIDTH;
10235             /* FALLTHROUGH */
10236         case '>':
10237             ender = reg_node(pRExC_state, SUCCEED);
10238             break;
10239         case 0:
10240             ender = reg_node(pRExC_state, END);
10241             if (!SIZE_ONLY) {
10242                 assert(!RExC_opend); /* there can only be one! */
10243                 RExC_opend = ender;
10244             }
10245             break;
10246         }
10247         DEBUG_PARSE_r(if (!SIZE_ONLY) {
10248             SV * const mysv_val1=sv_newmortal();
10249             SV * const mysv_val2=sv_newmortal();
10250             DEBUG_PARSE_MSG("lsbr");
10251             regprop(RExC_rx, mysv_val1, lastbr, NULL);
10252             regprop(RExC_rx, mysv_val2, ender, NULL);
10253             PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10254                           SvPV_nolen_const(mysv_val1),
10255                           (IV)REG_NODE_NUM(lastbr),
10256                           SvPV_nolen_const(mysv_val2),
10257                           (IV)REG_NODE_NUM(ender),
10258                           (IV)(ender - lastbr)
10259             );
10260         });
10261         REGTAIL(pRExC_state, lastbr, ender);
10262
10263         if (have_branch && !SIZE_ONLY) {
10264             char is_nothing= 1;
10265             if (depth==1)
10266                 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
10267
10268             /* Hook the tails of the branches to the closing node. */
10269             for (br = ret; br; br = regnext(br)) {
10270                 const U8 op = PL_regkind[OP(br)];
10271                 if (op == BRANCH) {
10272                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
10273                     if ( OP(NEXTOPER(br)) != NOTHING
10274                          || regnext(NEXTOPER(br)) != ender)
10275                         is_nothing= 0;
10276                 }
10277                 else if (op == BRANCHJ) {
10278                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
10279                     /* for now we always disable this optimisation * /
10280                     if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
10281                          || regnext(NEXTOPER(NEXTOPER(br))) != ender)
10282                     */
10283                         is_nothing= 0;
10284                 }
10285             }
10286             if (is_nothing) {
10287                 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
10288                 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10289                     SV * const mysv_val1=sv_newmortal();
10290                     SV * const mysv_val2=sv_newmortal();
10291                     DEBUG_PARSE_MSG("NADA");
10292                     regprop(RExC_rx, mysv_val1, ret, NULL);
10293                     regprop(RExC_rx, mysv_val2, ender, NULL);
10294                     PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10295                                   SvPV_nolen_const(mysv_val1),
10296                                   (IV)REG_NODE_NUM(ret),
10297                                   SvPV_nolen_const(mysv_val2),
10298                                   (IV)REG_NODE_NUM(ender),
10299                                   (IV)(ender - ret)
10300                     );
10301                 });
10302                 OP(br)= NOTHING;
10303                 if (OP(ender) == TAIL) {
10304                     NEXT_OFF(br)= 0;
10305                     RExC_emit= br + 1;
10306                 } else {
10307                     regnode *opt;
10308                     for ( opt= br + 1; opt < ender ; opt++ )
10309                         OP(opt)= OPTIMIZED;
10310                     NEXT_OFF(br)= ender - br;
10311                 }
10312             }
10313         }
10314     }
10315
10316     {
10317         const char *p;
10318         static const char parens[] = "=!<,>";
10319
10320         if (paren && (p = strchr(parens, paren))) {
10321             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
10322             int flag = (p - parens) > 1;
10323
10324             if (paren == '>')
10325                 node = SUSPEND, flag = 0;
10326             reginsert(pRExC_state, node,ret, depth+1);
10327             Set_Node_Cur_Length(ret, parse_start);
10328             Set_Node_Offset(ret, parse_start + 1);
10329             ret->flags = flag;
10330             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10331         }
10332     }
10333
10334     /* Check for proper termination. */
10335     if (paren) {
10336         /* restore original flags, but keep (?p) */
10337         RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10338         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
10339             RExC_parse = oregcomp_parse;
10340             vFAIL("Unmatched (");
10341         }
10342     }
10343     else if (!paren && RExC_parse < RExC_end) {
10344         if (*RExC_parse == ')') {
10345             RExC_parse++;
10346             vFAIL("Unmatched )");
10347         }
10348         else
10349             FAIL("Junk on end of regexp");      /* "Can't happen". */
10350         assert(0); /* NOTREACHED */
10351     }
10352
10353     if (RExC_in_lookbehind) {
10354         RExC_in_lookbehind--;
10355     }
10356     if (after_freeze > RExC_npar)
10357         RExC_npar = after_freeze;
10358     return(ret);
10359 }
10360
10361 /*
10362  - regbranch - one alternative of an | operator
10363  *
10364  * Implements the concatenation operator.
10365  *
10366  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10367  * restarted.
10368  */
10369 STATIC regnode *
10370 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10371 {
10372     dVAR;
10373     regnode *ret;
10374     regnode *chain = NULL;
10375     regnode *latest;
10376     I32 flags = 0, c = 0;
10377     GET_RE_DEBUG_FLAGS_DECL;
10378
10379     PERL_ARGS_ASSERT_REGBRANCH;
10380
10381     DEBUG_PARSE("brnc");
10382
10383     if (first)
10384         ret = NULL;
10385     else {
10386         if (!SIZE_ONLY && RExC_extralen)
10387             ret = reganode(pRExC_state, BRANCHJ,0);
10388         else {
10389             ret = reg_node(pRExC_state, BRANCH);
10390             Set_Node_Length(ret, 1);
10391         }
10392     }
10393
10394     if (!first && SIZE_ONLY)
10395         RExC_extralen += 1;                     /* BRANCHJ */
10396
10397     *flagp = WORST;                     /* Tentatively. */
10398
10399     RExC_parse--;
10400     nextchar(pRExC_state);
10401     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10402         flags &= ~TRYAGAIN;
10403         latest = regpiece(pRExC_state, &flags,depth+1);
10404         if (latest == NULL) {
10405             if (flags & TRYAGAIN)
10406                 continue;
10407             if (flags & RESTART_UTF8) {
10408                 *flagp = RESTART_UTF8;
10409                 return NULL;
10410             }
10411             FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10412         }
10413         else if (ret == NULL)
10414             ret = latest;
10415         *flagp |= flags&(HASWIDTH|POSTPONED);
10416         if (chain == NULL)      /* First piece. */
10417             *flagp |= flags&SPSTART;
10418         else {
10419             RExC_naughty++;
10420             REGTAIL(pRExC_state, chain, latest);
10421         }
10422         chain = latest;
10423         c++;
10424     }
10425     if (chain == NULL) {        /* Loop ran zero times. */
10426         chain = reg_node(pRExC_state, NOTHING);
10427         if (ret == NULL)
10428             ret = chain;
10429     }
10430     if (c == 1) {
10431         *flagp |= flags&SIMPLE;
10432     }
10433
10434     return ret;
10435 }
10436
10437 /*
10438  - regpiece - something followed by possible [*+?]
10439  *
10440  * Note that the branching code sequences used for ? and the general cases
10441  * of * and + are somewhat optimized:  they use the same NOTHING node as
10442  * both the endmarker for their branch list and the body of the last branch.
10443  * It might seem that this node could be dispensed with entirely, but the
10444  * endmarker role is not redundant.
10445  *
10446  * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10447  * TRYAGAIN.
10448  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10449  * restarted.
10450  */
10451 STATIC regnode *
10452 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10453 {
10454     dVAR;
10455     regnode *ret;
10456     char op;
10457     char *next;
10458     I32 flags;
10459     const char * const origparse = RExC_parse;
10460     I32 min;
10461     I32 max = REG_INFTY;
10462 #ifdef RE_TRACK_PATTERN_OFFSETS
10463     char *parse_start;
10464 #endif
10465     const char *maxpos = NULL;
10466
10467     /* Save the original in case we change the emitted regop to a FAIL. */
10468     regnode * const orig_emit = RExC_emit;
10469
10470     GET_RE_DEBUG_FLAGS_DECL;
10471
10472     PERL_ARGS_ASSERT_REGPIECE;
10473
10474     DEBUG_PARSE("piec");
10475
10476     ret = regatom(pRExC_state, &flags,depth+1);
10477     if (ret == NULL) {
10478         if (flags & (TRYAGAIN|RESTART_UTF8))
10479             *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10480         else
10481             FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10482         return(NULL);
10483     }
10484
10485     op = *RExC_parse;
10486
10487     if (op == '{' && regcurly(RExC_parse, FALSE)) {
10488         maxpos = NULL;
10489 #ifdef RE_TRACK_PATTERN_OFFSETS
10490         parse_start = RExC_parse; /* MJD */
10491 #endif
10492         next = RExC_parse + 1;
10493         while (isDIGIT(*next) || *next == ',') {
10494             if (*next == ',') {
10495                 if (maxpos)
10496                     break;
10497                 else
10498                     maxpos = next;
10499             }
10500             next++;
10501         }
10502         if (*next == '}') {             /* got one */
10503             if (!maxpos)
10504                 maxpos = next;
10505             RExC_parse++;
10506             min = atoi(RExC_parse);
10507             if (*maxpos == ',')
10508                 maxpos++;
10509             else
10510                 maxpos = RExC_parse;
10511             max = atoi(maxpos);
10512             if (!max && *maxpos != '0')
10513                 max = REG_INFTY;                /* meaning "infinity" */
10514             else if (max >= REG_INFTY)
10515                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10516             RExC_parse = next;
10517             nextchar(pRExC_state);
10518             if (max < min) {    /* If can't match, warn and optimize to fail
10519                                    unconditionally */
10520                 if (SIZE_ONLY) {
10521                     ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10522
10523                     /* We can't back off the size because we have to reserve
10524                      * enough space for all the things we are about to throw
10525                      * away, but we can shrink it by the ammount we are about
10526                      * to re-use here */
10527                     RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10528                 }
10529                 else {
10530                     RExC_emit = orig_emit;
10531                 }
10532                 ret = reg_node(pRExC_state, OPFAIL);
10533                 return ret;
10534             }
10535             else if (min == max
10536                      && RExC_parse < RExC_end
10537                      && (*RExC_parse == '?' || *RExC_parse == '+'))
10538             {
10539                 if (SIZE_ONLY) {
10540                     ckWARN2reg(RExC_parse + 1,
10541                                "Useless use of greediness modifier '%c'",
10542                                *RExC_parse);
10543                 }
10544                 /* Absorb the modifier, so later code doesn't see nor use
10545                     * it */
10546                 nextchar(pRExC_state);
10547             }
10548
10549         do_curly:
10550             if ((flags&SIMPLE)) {
10551                 RExC_naughty += 2 + RExC_naughty / 2;
10552                 reginsert(pRExC_state, CURLY, ret, depth+1);
10553                 Set_Node_Offset(ret, parse_start+1); /* MJD */
10554                 Set_Node_Cur_Length(ret, parse_start);
10555             }
10556             else {
10557                 regnode * const w = reg_node(pRExC_state, WHILEM);
10558
10559                 w->flags = 0;
10560                 REGTAIL(pRExC_state, ret, w);
10561                 if (!SIZE_ONLY && RExC_extralen) {
10562                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
10563                     reginsert(pRExC_state, NOTHING,ret, depth+1);
10564                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
10565                 }
10566                 reginsert(pRExC_state, CURLYX,ret, depth+1);
10567                                 /* MJD hk */
10568                 Set_Node_Offset(ret, parse_start+1);
10569                 Set_Node_Length(ret,
10570                                 op == '{' ? (RExC_parse - parse_start) : 1);
10571
10572                 if (!SIZE_ONLY && RExC_extralen)
10573                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
10574                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10575                 if (SIZE_ONLY)
10576                     RExC_whilem_seen++, RExC_extralen += 3;
10577                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
10578             }
10579             ret->flags = 0;
10580
10581             if (min > 0)
10582                 *flagp = WORST;
10583             if (max > 0)
10584                 *flagp |= HASWIDTH;
10585             if (!SIZE_ONLY) {
10586                 ARG1_SET(ret, (U16)min);
10587                 ARG2_SET(ret, (U16)max);
10588             }
10589             if (max == REG_INFTY)
10590                 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10591
10592             goto nest_check;
10593         }
10594     }
10595
10596     if (!ISMULT1(op)) {
10597         *flagp = flags;
10598         return(ret);
10599     }
10600
10601 #if 0                           /* Now runtime fix should be reliable. */
10602
10603     /* if this is reinstated, don't forget to put this back into perldiag:
10604
10605             =item Regexp *+ operand could be empty at {#} in regex m/%s/
10606
10607            (F) The part of the regexp subject to either the * or + quantifier
10608            could match an empty string. The {#} shows in the regular
10609            expression about where the problem was discovered.
10610
10611     */
10612
10613     if (!(flags&HASWIDTH) && op != '?')
10614       vFAIL("Regexp *+ operand could be empty");
10615 #endif
10616
10617 #ifdef RE_TRACK_PATTERN_OFFSETS
10618     parse_start = RExC_parse;
10619 #endif
10620     nextchar(pRExC_state);
10621
10622     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10623
10624     if (op == '*' && (flags&SIMPLE)) {
10625         reginsert(pRExC_state, STAR, ret, depth+1);
10626         ret->flags = 0;
10627         RExC_naughty += 4;
10628         RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10629     }
10630     else if (op == '*') {
10631         min = 0;
10632         goto do_curly;
10633     }
10634     else if (op == '+' && (flags&SIMPLE)) {
10635         reginsert(pRExC_state, PLUS, ret, depth+1);
10636         ret->flags = 0;
10637         RExC_naughty += 3;
10638         RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10639     }
10640     else if (op == '+') {
10641         min = 1;
10642         goto do_curly;
10643     }
10644     else if (op == '?') {
10645         min = 0; max = 1;
10646         goto do_curly;
10647     }
10648   nest_check:
10649     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10650         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10651         ckWARN2reg(RExC_parse,
10652                    "%"UTF8f" matches null string many times",
10653                    UTF8fARG(UTF, (RExC_parse >= origparse
10654                                  ? RExC_parse - origparse
10655                                  : 0),
10656                    origparse));
10657         (void)ReREFCNT_inc(RExC_rx_sv);
10658     }
10659
10660     if (RExC_parse < RExC_end && *RExC_parse == '?') {
10661         nextchar(pRExC_state);
10662         reginsert(pRExC_state, MINMOD, ret, depth+1);
10663         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
10664     }
10665     else
10666     if (RExC_parse < RExC_end && *RExC_parse == '+') {
10667         regnode *ender;
10668         nextchar(pRExC_state);
10669         ender = reg_node(pRExC_state, SUCCEED);
10670         REGTAIL(pRExC_state, ret, ender);
10671         reginsert(pRExC_state, SUSPEND, ret, depth+1);
10672         ret->flags = 0;
10673         ender = reg_node(pRExC_state, TAIL);
10674         REGTAIL(pRExC_state, ret, ender);
10675     }
10676
10677     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
10678         RExC_parse++;
10679         vFAIL("Nested quantifiers");
10680     }
10681
10682     return(ret);
10683 }
10684
10685 STATIC bool
10686 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
10687                       UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
10688                       const bool strict   /* Apply stricter parsing rules? */
10689     )
10690 {
10691
10692  /* This is expected to be called by a parser routine that has recognized '\N'
10693    and needs to handle the rest. RExC_parse is expected to point at the first
10694    char following the N at the time of the call.  On successful return,
10695    RExC_parse has been updated to point to just after the sequence identified
10696    by this routine, and <*flagp> has been updated.
10697
10698    The \N may be inside (indicated by the boolean <in_char_class>) or outside a
10699    character class.
10700
10701    \N may begin either a named sequence, or if outside a character class, mean
10702    to match a non-newline.  For non single-quoted regexes, the tokenizer has
10703    attempted to decide which, and in the case of a named sequence, converted it
10704    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
10705    where c1... are the characters in the sequence.  For single-quoted regexes,
10706    the tokenizer passes the \N sequence through unchanged; this code will not
10707    attempt to determine this nor expand those, instead raising a syntax error.
10708    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
10709    or there is no '}', it signals that this \N occurrence means to match a
10710    non-newline.
10711
10712    Only the \N{U+...} form should occur in a character class, for the same
10713    reason that '.' inside a character class means to just match a period: it
10714    just doesn't make sense.
10715
10716    The function raises an error (via vFAIL), and doesn't return for various
10717    syntax errors.  Otherwise it returns TRUE and sets <node_p> or <valuep> on
10718    success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
10719    RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
10720    only possible if node_p is non-NULL.
10721
10722
10723    If <valuep> is non-null, it means the caller can accept an input sequence
10724    consisting of a just a single code point; <*valuep> is set to that value
10725    if the input is such.
10726
10727    If <node_p> is non-null it signifies that the caller can accept any other
10728    legal sequence (i.e., one that isn't just a single code point).  <*node_p>
10729    is set as follows:
10730     1) \N means not-a-NL: points to a newly created REG_ANY node;
10731     2) \N{}:              points to a new NOTHING node;
10732     3) otherwise:         points to a new EXACT node containing the resolved
10733                           string.
10734    Note that FALSE is returned for single code point sequences if <valuep> is
10735    null.
10736  */
10737
10738     char * endbrace;    /* '}' following the name */
10739     char* p;
10740     char *endchar;      /* Points to '.' or '}' ending cur char in the input
10741                            stream */
10742     bool has_multiple_chars; /* true if the input stream contains a sequence of
10743                                 more than one character */
10744
10745     GET_RE_DEBUG_FLAGS_DECL;
10746
10747     PERL_ARGS_ASSERT_GROK_BSLASH_N;
10748
10749     GET_RE_DEBUG_FLAGS;
10750
10751     assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
10752
10753     /* The [^\n] meaning of \N ignores spaces and comments under the /x
10754      * modifier.  The other meaning does not, so use a temporary until we find
10755      * out which we are being called with */
10756     p = (RExC_flags & RXf_PMf_EXTENDED)
10757         ? regpatws(pRExC_state, RExC_parse,
10758                                 TRUE) /* means recognize comments */
10759         : RExC_parse;
10760
10761     /* Disambiguate between \N meaning a named character versus \N meaning
10762      * [^\n].  The former is assumed when it can't be the latter. */
10763     if (*p != '{' || regcurly(p, FALSE)) {
10764         RExC_parse = p;
10765         if (! node_p) {
10766             /* no bare \N allowed in a charclass */
10767             if (in_char_class) {
10768                 vFAIL("\\N in a character class must be a named character: \\N{...}");
10769             }
10770             return FALSE;
10771         }
10772         RExC_parse--;   /* Need to back off so nextchar() doesn't skip the
10773                            current char */
10774         nextchar(pRExC_state);
10775         *node_p = reg_node(pRExC_state, REG_ANY);
10776         *flagp |= HASWIDTH|SIMPLE;
10777         RExC_naughty++;
10778         Set_Node_Length(*node_p, 1); /* MJD */
10779         return TRUE;
10780     }
10781
10782     /* Here, we have decided it should be a named character or sequence */
10783
10784     /* The test above made sure that the next real character is a '{', but
10785      * under the /x modifier, it could be separated by space (or a comment and
10786      * \n) and this is not allowed (for consistency with \x{...} and the
10787      * tokenizer handling of \N{NAME}). */
10788     if (*RExC_parse != '{') {
10789         vFAIL("Missing braces on \\N{}");
10790     }
10791
10792     RExC_parse++;       /* Skip past the '{' */
10793
10794     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
10795         || ! (endbrace == RExC_parse            /* nothing between the {} */
10796               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below
10797                                                  */
10798                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg)
10799                                                      */
10800     {
10801         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
10802         vFAIL("\\N{NAME} must be resolved by the lexer");
10803     }
10804
10805     if (endbrace == RExC_parse) {   /* empty: \N{} */
10806         bool ret = TRUE;
10807         if (node_p) {
10808             *node_p = reg_node(pRExC_state,NOTHING);
10809         }
10810         else if (in_char_class) {
10811             if (SIZE_ONLY && in_char_class) {
10812                 if (strict) {
10813                     RExC_parse++;   /* Position after the "}" */
10814                     vFAIL("Zero length \\N{}");
10815                 }
10816                 else {
10817                     ckWARNreg(RExC_parse,
10818                               "Ignoring zero length \\N{} in character class");
10819                 }
10820             }
10821             ret = FALSE;
10822         }
10823         else {
10824             return FALSE;
10825         }
10826         nextchar(pRExC_state);
10827         return ret;
10828     }
10829
10830     RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
10831     RExC_parse += 2;    /* Skip past the 'U+' */
10832
10833     endchar = RExC_parse + strcspn(RExC_parse, ".}");
10834
10835     /* Code points are separated by dots.  If none, there is only one code
10836      * point, and is terminated by the brace */
10837     has_multiple_chars = (endchar < endbrace);
10838
10839     if (valuep && (! has_multiple_chars || in_char_class)) {
10840         /* We only pay attention to the first char of
10841         multichar strings being returned in char classes. I kinda wonder
10842         if this makes sense as it does change the behaviour
10843         from earlier versions, OTOH that behaviour was broken
10844         as well. XXX Solution is to recharacterize as
10845         [rest-of-class]|multi1|multi2... */
10846
10847         STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
10848         I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
10849             | PERL_SCAN_DISALLOW_PREFIX
10850             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
10851
10852         *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
10853
10854         /* The tokenizer should have guaranteed validity, but it's possible to
10855          * bypass it by using single quoting, so check */
10856         if (length_of_hex == 0
10857             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
10858         {
10859             RExC_parse += length_of_hex;        /* Includes all the valid */
10860             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
10861                             ? UTF8SKIP(RExC_parse)
10862                             : 1;
10863             /* Guard against malformed utf8 */
10864             if (RExC_parse >= endchar) {
10865                 RExC_parse = endchar;
10866             }
10867             vFAIL("Invalid hexadecimal number in \\N{U+...}");
10868         }
10869
10870         if (in_char_class && has_multiple_chars) {
10871             if (strict) {
10872                 RExC_parse = endbrace;
10873                 vFAIL("\\N{} in character class restricted to one character");
10874             }
10875             else {
10876                 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
10877             }
10878         }
10879
10880         RExC_parse = endbrace + 1;
10881     }
10882     else if (! node_p || ! has_multiple_chars) {
10883
10884         /* Here, the input is legal, but not according to the caller's
10885          * options.  We fail without advancing the parse, so that the
10886          * caller can try again */
10887         RExC_parse = p;
10888         return FALSE;
10889     }
10890     else {
10891
10892         /* What is done here is to convert this to a sub-pattern of the form
10893          * (?:\x{char1}\x{char2}...)
10894          * and then call reg recursively.  That way, it retains its atomicness,
10895          * while not having to worry about special handling that some code
10896          * points may have.  toke.c has converted the original Unicode values
10897          * to native, so that we can just pass on the hex values unchanged.  We
10898          * do have to set a flag to keep recoding from happening in the
10899          * recursion */
10900
10901         SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
10902         STRLEN len;
10903         char *orig_end = RExC_end;
10904         I32 flags;
10905
10906         while (RExC_parse < endbrace) {
10907
10908             /* Convert to notation the rest of the code understands */
10909             sv_catpv(substitute_parse, "\\x{");
10910             sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
10911             sv_catpv(substitute_parse, "}");
10912
10913             /* Point to the beginning of the next character in the sequence. */
10914             RExC_parse = endchar + 1;
10915             endchar = RExC_parse + strcspn(RExC_parse, ".}");
10916         }
10917         sv_catpv(substitute_parse, ")");
10918
10919         RExC_parse = SvPV(substitute_parse, len);
10920
10921         /* Don't allow empty number */
10922         if (len < 8) {
10923             vFAIL("Invalid hexadecimal number in \\N{U+...}");
10924         }
10925         RExC_end = RExC_parse + len;
10926
10927         /* The values are Unicode, and therefore not subject to recoding */
10928         RExC_override_recoding = 1;
10929
10930         if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
10931             if (flags & RESTART_UTF8) {
10932                 *flagp = RESTART_UTF8;
10933                 return FALSE;
10934             }
10935             FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
10936                   (UV) flags);
10937         }
10938         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10939
10940         RExC_parse = endbrace;
10941         RExC_end = orig_end;
10942         RExC_override_recoding = 0;
10943
10944         nextchar(pRExC_state);
10945     }
10946
10947     return TRUE;
10948 }
10949
10950
10951 /*
10952  * reg_recode
10953  *
10954  * It returns the code point in utf8 for the value in *encp.
10955  *    value: a code value in the source encoding
10956  *    encp:  a pointer to an Encode object
10957  *
10958  * If the result from Encode is not a single character,
10959  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
10960  */
10961 STATIC UV
10962 S_reg_recode(pTHX_ const char value, SV **encp)
10963 {
10964     STRLEN numlen = 1;
10965     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
10966     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
10967     const STRLEN newlen = SvCUR(sv);
10968     UV uv = UNICODE_REPLACEMENT;
10969
10970     PERL_ARGS_ASSERT_REG_RECODE;
10971
10972     if (newlen)
10973         uv = SvUTF8(sv)
10974              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
10975              : *(U8*)s;
10976
10977     if (!newlen || numlen != newlen) {
10978         uv = UNICODE_REPLACEMENT;
10979         *encp = NULL;
10980     }
10981     return uv;
10982 }
10983
10984 PERL_STATIC_INLINE U8
10985 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
10986 {
10987     U8 op;
10988
10989     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
10990
10991     if (! FOLD) {
10992         return EXACT;
10993     }
10994
10995     op = get_regex_charset(RExC_flags);
10996     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
10997         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
10998                  been, so there is no hole */
10999     }
11000
11001     return op + EXACTF;
11002 }
11003
11004 PERL_STATIC_INLINE void
11005 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
11006                          regnode *node, I32* flagp, STRLEN len, UV code_point,
11007                          bool downgradable)
11008 {
11009     /* This knows the details about sizing an EXACTish node, setting flags for
11010      * it (by setting <*flagp>, and potentially populating it with a single
11011      * character.
11012      *
11013      * If <len> (the length in bytes) is non-zero, this function assumes that
11014      * the node has already been populated, and just does the sizing.  In this
11015      * case <code_point> should be the final code point that has already been
11016      * placed into the node.  This value will be ignored except that under some
11017      * circumstances <*flagp> is set based on it.
11018      *
11019      * If <len> is zero, the function assumes that the node is to contain only
11020      * the single character given by <code_point> and calculates what <len>
11021      * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
11022      * additionally will populate the node's STRING with <code_point> or its
11023      * fold if folding.
11024      *
11025      * In both cases <*flagp> is appropriately set
11026      *
11027      * It knows that under FOLD, the Latin Sharp S and UTF characters above
11028      * 255, must be folded (the former only when the rules indicate it can
11029      * match 'ss')
11030      *
11031      * When it does the populating, it looks at the flag 'downgradable'.  If
11032      * true with a node that folds, it checks if the single code point
11033      * participates in a fold, and if not downgrades the node to an EXACT.
11034      * This helps the optimizer */
11035
11036     bool len_passed_in = cBOOL(len != 0);
11037     U8 character[UTF8_MAXBYTES_CASE+1];
11038
11039     PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
11040
11041     /* Don't bother to check for downgrading in PASS1, as it doesn't make any
11042      * sizing difference, and is extra work that is thrown away */
11043     if (downgradable && ! PASS2) {
11044         downgradable = FALSE;
11045     }
11046
11047     if (! len_passed_in) {
11048         if (UTF) {
11049             if (UNI_IS_INVARIANT(code_point)) {
11050                 if (LOC || ! FOLD) {    /* /l defers folding until runtime */
11051                     *character = (U8) code_point;
11052                 }
11053                 else { /* Here is /i and not /l (toFOLD() is defined on just
11054                           ASCII, which isn't the same thing as INVARIANT on
11055                           EBCDIC, but it works there, as the extra invariants
11056                           fold to themselves) */
11057                     *character = toFOLD((U8) code_point);
11058                     if (downgradable
11059                         && *character == code_point
11060                         && ! HAS_NONLATIN1_FOLD_CLOSURE(code_point))
11061                     {
11062                         OP(node) = EXACT;
11063                     }
11064                 }
11065                 len = 1;
11066             }
11067             else if (FOLD && (! LOC
11068                               || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
11069             {   /* Folding, and ok to do so now */
11070                 UV folded = _to_uni_fold_flags(
11071                                    code_point,
11072                                    character,
11073                                    &len,
11074                                    FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
11075                                                       ? FOLD_FLAGS_NOMIX_ASCII
11076                                                       : 0));
11077                 if (downgradable
11078                     && folded == code_point
11079                     && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
11080                 {
11081                     OP(node) = EXACT;
11082                 }
11083             }
11084             else if (code_point <= MAX_UTF8_TWO_BYTE) {
11085
11086                 /* Not folding this cp, and can output it directly */
11087                 *character = UTF8_TWO_BYTE_HI(code_point);
11088                 *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
11089                 len = 2;
11090             }
11091             else {
11092                 uvchr_to_utf8( character, code_point);
11093                 len = UTF8SKIP(character);
11094             }
11095         } /* Else pattern isn't UTF8.  */
11096         else if (! FOLD) {
11097             *character = (U8) code_point;
11098             len = 1;
11099         } /* Else is folded non-UTF8 */
11100         else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
11101
11102             /* We don't fold any non-UTF8 except possibly the Sharp s  (see
11103              * comments at join_exact()); */
11104             *character = (U8) code_point;
11105             len = 1;
11106
11107             /* Can turn into an EXACT node if we know the fold at compile time,
11108              * and it folds to itself and doesn't particpate in other folds */
11109             if (downgradable
11110                 && ! LOC
11111                 && PL_fold_latin1[code_point] == code_point
11112                 && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11113                     || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
11114             {
11115                 OP(node) = EXACT;
11116             }
11117         } /* else is Sharp s.  May need to fold it */
11118         else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
11119             *character = 's';
11120             *(character + 1) = 's';
11121             len = 2;
11122         }
11123         else {
11124             *character = LATIN_SMALL_LETTER_SHARP_S;
11125             len = 1;
11126         }
11127     }
11128
11129     if (SIZE_ONLY) {
11130         RExC_size += STR_SZ(len);
11131     }
11132     else {
11133         RExC_emit += STR_SZ(len);
11134         STR_LEN(node) = len;
11135         if (! len_passed_in) {
11136             Copy((char *) character, STRING(node), len, char);
11137         }
11138     }
11139
11140     *flagp |= HASWIDTH;
11141
11142     /* A single character node is SIMPLE, except for the special-cased SHARP S
11143      * under /di. */
11144     if ((len == 1 || (UTF && len == UNISKIP(code_point)))
11145         && (code_point != LATIN_SMALL_LETTER_SHARP_S
11146             || ! FOLD || ! DEPENDS_SEMANTICS))
11147     {
11148         *flagp |= SIMPLE;
11149     }
11150
11151     /* The OP may not be well defined in PASS1 */
11152     if (PASS2 && OP(node) == EXACTFL) {
11153         RExC_contains_locale = 1;
11154     }
11155 }
11156
11157
11158 /* return atoi(p), unless it's too big to sensibly be a backref,
11159  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
11160
11161 static I32
11162 S_backref_value(char *p)
11163 {
11164     char *q = p;
11165
11166     for (;isDIGIT(*q); q++) {} /* calculate length of num */
11167     if (q - p == 0 || q - p > 9)
11168         return I32_MAX;
11169     return atoi(p);
11170 }
11171
11172
11173 /*
11174  - regatom - the lowest level
11175
11176    Try to identify anything special at the start of the pattern. If there
11177    is, then handle it as required. This may involve generating a single regop,
11178    such as for an assertion; or it may involve recursing, such as to
11179    handle a () structure.
11180
11181    If the string doesn't start with something special then we gobble up
11182    as much literal text as we can.
11183
11184    Once we have been able to handle whatever type of thing started the
11185    sequence, we return.
11186
11187    Note: we have to be careful with escapes, as they can be both literal
11188    and special, and in the case of \10 and friends, context determines which.
11189
11190    A summary of the code structure is:
11191
11192    switch (first_byte) {
11193         cases for each special:
11194             handle this special;
11195             break;
11196         case '\\':
11197             switch (2nd byte) {
11198                 cases for each unambiguous special:
11199                     handle this special;
11200                     break;
11201                 cases for each ambigous special/literal:
11202                     disambiguate;
11203                     if (special)  handle here
11204                     else goto defchar;
11205                 default: // unambiguously literal:
11206                     goto defchar;
11207             }
11208         default:  // is a literal char
11209             // FALL THROUGH
11210         defchar:
11211             create EXACTish node for literal;
11212             while (more input and node isn't full) {
11213                 switch (input_byte) {
11214                    cases for each special;
11215                        make sure parse pointer is set so that the next call to
11216                            regatom will see this special first
11217                        goto loopdone; // EXACTish node terminated by prev. char
11218                    default:
11219                        append char to EXACTISH node;
11220                 }
11221                 get next input byte;
11222             }
11223         loopdone:
11224    }
11225    return the generated node;
11226
11227    Specifically there are two separate switches for handling
11228    escape sequences, with the one for handling literal escapes requiring
11229    a dummy entry for all of the special escapes that are actually handled
11230    by the other.
11231
11232    Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
11233    TRYAGAIN.
11234    Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
11235    restarted.
11236    Otherwise does not return NULL.
11237 */
11238
11239 STATIC regnode *
11240 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11241 {
11242     dVAR;
11243     regnode *ret = NULL;
11244     I32 flags = 0;
11245     char *parse_start = RExC_parse;
11246     U8 op;
11247     int invert = 0;
11248
11249     GET_RE_DEBUG_FLAGS_DECL;
11250
11251     *flagp = WORST;             /* Tentatively. */
11252
11253     DEBUG_PARSE("atom");
11254
11255     PERL_ARGS_ASSERT_REGATOM;
11256
11257 tryagain:
11258     switch ((U8)*RExC_parse) {
11259     case '^':
11260         RExC_seen_zerolen++;
11261         nextchar(pRExC_state);
11262         if (RExC_flags & RXf_PMf_MULTILINE)
11263             ret = reg_node(pRExC_state, MBOL);
11264         else if (RExC_flags & RXf_PMf_SINGLELINE)
11265             ret = reg_node(pRExC_state, SBOL);
11266         else
11267             ret = reg_node(pRExC_state, BOL);
11268         Set_Node_Length(ret, 1); /* MJD */
11269         break;
11270     case '$':
11271         nextchar(pRExC_state);
11272         if (*RExC_parse)
11273             RExC_seen_zerolen++;
11274         if (RExC_flags & RXf_PMf_MULTILINE)
11275             ret = reg_node(pRExC_state, MEOL);
11276         else if (RExC_flags & RXf_PMf_SINGLELINE)
11277             ret = reg_node(pRExC_state, SEOL);
11278         else
11279             ret = reg_node(pRExC_state, EOL);
11280         Set_Node_Length(ret, 1); /* MJD */
11281         break;
11282     case '.':
11283         nextchar(pRExC_state);
11284         if (RExC_flags & RXf_PMf_SINGLELINE)
11285             ret = reg_node(pRExC_state, SANY);
11286         else
11287             ret = reg_node(pRExC_state, REG_ANY);
11288         *flagp |= HASWIDTH|SIMPLE;
11289         RExC_naughty++;
11290         Set_Node_Length(ret, 1); /* MJD */
11291         break;
11292     case '[':
11293     {
11294         char * const oregcomp_parse = ++RExC_parse;
11295         ret = regclass(pRExC_state, flagp,depth+1,
11296                        FALSE, /* means parse the whole char class */
11297                        TRUE, /* allow multi-char folds */
11298                        FALSE, /* don't silence non-portable warnings. */
11299                        NULL);
11300         if (*RExC_parse != ']') {
11301             RExC_parse = oregcomp_parse;
11302             vFAIL("Unmatched [");
11303         }
11304         if (ret == NULL) {
11305             if (*flagp & RESTART_UTF8)
11306                 return NULL;
11307             FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11308                   (UV) *flagp);
11309         }
11310         nextchar(pRExC_state);
11311         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
11312         break;
11313     }
11314     case '(':
11315         nextchar(pRExC_state);
11316         ret = reg(pRExC_state, 2, &flags,depth+1);
11317         if (ret == NULL) {
11318                 if (flags & TRYAGAIN) {
11319                     if (RExC_parse == RExC_end) {
11320                          /* Make parent create an empty node if needed. */
11321                         *flagp |= TRYAGAIN;
11322                         return(NULL);
11323                     }
11324                     goto tryagain;
11325                 }
11326                 if (flags & RESTART_UTF8) {
11327                     *flagp = RESTART_UTF8;
11328                     return NULL;
11329                 }
11330                 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
11331                                                                  (UV) flags);
11332         }
11333         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11334         break;
11335     case '|':
11336     case ')':
11337         if (flags & TRYAGAIN) {
11338             *flagp |= TRYAGAIN;
11339             return NULL;
11340         }
11341         vFAIL("Internal urp");
11342                                 /* Supposed to be caught earlier. */
11343         break;
11344     case '{':
11345         if (!regcurly(RExC_parse, FALSE)) {
11346             RExC_parse++;
11347             goto defchar;
11348         }
11349         /* FALLTHROUGH */
11350     case '?':
11351     case '+':
11352     case '*':
11353         RExC_parse++;
11354         vFAIL("Quantifier follows nothing");
11355         break;
11356     case '\\':
11357         /* Special Escapes
11358
11359            This switch handles escape sequences that resolve to some kind
11360            of special regop and not to literal text. Escape sequnces that
11361            resolve to literal text are handled below in the switch marked
11362            "Literal Escapes".
11363
11364            Every entry in this switch *must* have a corresponding entry
11365            in the literal escape switch. However, the opposite is not
11366            required, as the default for this switch is to jump to the
11367            literal text handling code.
11368         */
11369         switch ((U8)*++RExC_parse) {
11370             U8 arg;
11371         /* Special Escapes */
11372         case 'A':
11373             RExC_seen_zerolen++;
11374             ret = reg_node(pRExC_state, SBOL);
11375             *flagp |= SIMPLE;
11376             goto finish_meta_pat;
11377         case 'G':
11378             ret = reg_node(pRExC_state, GPOS);
11379             RExC_seen |= REG_GPOS_SEEN;
11380             *flagp |= SIMPLE;
11381             goto finish_meta_pat;
11382         case 'K':
11383             RExC_seen_zerolen++;
11384             ret = reg_node(pRExC_state, KEEPS);
11385             *flagp |= SIMPLE;
11386             /* XXX:dmq : disabling in-place substitution seems to
11387              * be necessary here to avoid cases of memory corruption, as
11388              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
11389              */
11390             RExC_seen |= REG_LOOKBEHIND_SEEN;
11391             goto finish_meta_pat;
11392         case 'Z':
11393             ret = reg_node(pRExC_state, SEOL);
11394             *flagp |= SIMPLE;
11395             RExC_seen_zerolen++;                /* Do not optimize RE away */
11396             goto finish_meta_pat;
11397         case 'z':
11398             ret = reg_node(pRExC_state, EOS);
11399             *flagp |= SIMPLE;
11400             RExC_seen_zerolen++;                /* Do not optimize RE away */
11401             goto finish_meta_pat;
11402         case 'C':
11403             ret = reg_node(pRExC_state, CANY);
11404             RExC_seen |= REG_CANY_SEEN;
11405             *flagp |= HASWIDTH|SIMPLE;
11406             goto finish_meta_pat;
11407         case 'X':
11408             ret = reg_node(pRExC_state, CLUMP);
11409             *flagp |= HASWIDTH;
11410             goto finish_meta_pat;
11411
11412         case 'W':
11413             invert = 1;
11414             /* FALLTHROUGH */
11415         case 'w':
11416             arg = ANYOF_WORDCHAR;
11417             goto join_posix;
11418
11419         case 'b':
11420             RExC_seen_zerolen++;
11421             RExC_seen |= REG_LOOKBEHIND_SEEN;
11422             op = BOUND + get_regex_charset(RExC_flags);
11423             if (op > BOUNDA) {  /* /aa is same as /a */
11424                 op = BOUNDA;
11425             }
11426             else if (op == BOUNDL) {
11427                 RExC_contains_locale = 1;
11428             }
11429             ret = reg_node(pRExC_state, op);
11430             FLAGS(ret) = get_regex_charset(RExC_flags);
11431             *flagp |= SIMPLE;
11432             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11433                 /* diag_listed_as: Use "%s" instead of "%s" */
11434                 vFAIL("Use \"\\b\\{\" instead of \"\\b{\"");
11435             }
11436             goto finish_meta_pat;
11437         case 'B':
11438             RExC_seen_zerolen++;
11439             RExC_seen |= REG_LOOKBEHIND_SEEN;
11440             op = NBOUND + get_regex_charset(RExC_flags);
11441             if (op > NBOUNDA) { /* /aa is same as /a */
11442                 op = NBOUNDA;
11443             }
11444             else if (op == NBOUNDL) {
11445                 RExC_contains_locale = 1;
11446             }
11447             ret = reg_node(pRExC_state, op);
11448             FLAGS(ret) = get_regex_charset(RExC_flags);
11449             *flagp |= SIMPLE;
11450             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11451                 /* diag_listed_as: Use "%s" instead of "%s" */
11452                 vFAIL("Use \"\\B\\{\" instead of \"\\B{\"");
11453             }
11454             goto finish_meta_pat;
11455
11456         case 'D':
11457             invert = 1;
11458             /* FALLTHROUGH */
11459         case 'd':
11460             arg = ANYOF_DIGIT;
11461             goto join_posix;
11462
11463         case 'R':
11464             ret = reg_node(pRExC_state, LNBREAK);
11465             *flagp |= HASWIDTH|SIMPLE;
11466             goto finish_meta_pat;
11467
11468         case 'H':
11469             invert = 1;
11470             /* FALLTHROUGH */
11471         case 'h':
11472             arg = ANYOF_BLANK;
11473             op = POSIXU;
11474             goto join_posix_op_known;
11475
11476         case 'V':
11477             invert = 1;
11478             /* FALLTHROUGH */
11479         case 'v':
11480             arg = ANYOF_VERTWS;
11481             op = POSIXU;
11482             goto join_posix_op_known;
11483
11484         case 'S':
11485             invert = 1;
11486             /* FALLTHROUGH */
11487         case 's':
11488             arg = ANYOF_SPACE;
11489
11490         join_posix:
11491
11492             op = POSIXD + get_regex_charset(RExC_flags);
11493             if (op > POSIXA) {  /* /aa is same as /a */
11494                 op = POSIXA;
11495             }
11496             else if (op == POSIXL) {
11497                 RExC_contains_locale = 1;
11498             }
11499
11500         join_posix_op_known:
11501
11502             if (invert) {
11503                 op += NPOSIXD - POSIXD;
11504             }
11505
11506             ret = reg_node(pRExC_state, op);
11507             if (! SIZE_ONLY) {
11508                 FLAGS(ret) = namedclass_to_classnum(arg);
11509             }
11510
11511             *flagp |= HASWIDTH|SIMPLE;
11512             /* FALLTHROUGH */
11513
11514          finish_meta_pat:
11515             nextchar(pRExC_state);
11516             Set_Node_Length(ret, 2); /* MJD */
11517             break;
11518         case 'p':
11519         case 'P':
11520             {
11521 #ifdef DEBUGGING
11522                 char* parse_start = RExC_parse - 2;
11523 #endif
11524
11525                 RExC_parse--;
11526
11527                 ret = regclass(pRExC_state, flagp,depth+1,
11528                                TRUE, /* means just parse this element */
11529                                FALSE, /* don't allow multi-char folds */
11530                                FALSE, /* don't silence non-portable warnings.
11531                                          It would be a bug if these returned
11532                                          non-portables */
11533                                NULL);
11534                 /* regclass() can only return RESTART_UTF8 if multi-char folds
11535                    are allowed.  */
11536                 if (!ret)
11537                     FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11538                           (UV) *flagp);
11539
11540                 RExC_parse--;
11541
11542                 Set_Node_Offset(ret, parse_start + 2);
11543                 Set_Node_Cur_Length(ret, parse_start);
11544                 nextchar(pRExC_state);
11545             }
11546             break;
11547         case 'N':
11548             /* Handle \N and \N{NAME} with multiple code points here and not
11549              * below because it can be multicharacter. join_exact() will join
11550              * them up later on.  Also this makes sure that things like
11551              * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
11552              * The options to the grok function call causes it to fail if the
11553              * sequence is just a single code point.  We then go treat it as
11554              * just another character in the current EXACT node, and hence it
11555              * gets uniform treatment with all the other characters.  The
11556              * special treatment for quantifiers is not needed for such single
11557              * character sequences */
11558             ++RExC_parse;
11559             if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
11560                                 FALSE /* not strict */ )) {
11561                 if (*flagp & RESTART_UTF8)
11562                     return NULL;
11563                 RExC_parse--;
11564                 goto defchar;
11565             }
11566             break;
11567         case 'k':    /* Handle \k<NAME> and \k'NAME' */
11568         parse_named_seq:
11569         {
11570             char ch= RExC_parse[1];
11571             if (ch != '<' && ch != '\'' && ch != '{') {
11572                 RExC_parse++;
11573                 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11574                 vFAIL2("Sequence %.2s... not terminated",parse_start);
11575             } else {
11576                 /* this pretty much dupes the code for (?P=...) in reg(), if
11577                    you change this make sure you change that */
11578                 char* name_start = (RExC_parse += 2);
11579                 U32 num = 0;
11580                 SV *sv_dat = reg_scan_name(pRExC_state,
11581                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11582                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
11583                 if (RExC_parse == name_start || *RExC_parse != ch)
11584                     /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11585                     vFAIL2("Sequence %.3s... not terminated",parse_start);
11586
11587                 if (!SIZE_ONLY) {
11588                     num = add_data( pRExC_state, STR_WITH_LEN("S"));
11589                     RExC_rxi->data->data[num]=(void*)sv_dat;
11590                     SvREFCNT_inc_simple_void(sv_dat);
11591                 }
11592
11593                 RExC_sawback = 1;
11594                 ret = reganode(pRExC_state,
11595                                ((! FOLD)
11596                                  ? NREF
11597                                  : (ASCII_FOLD_RESTRICTED)
11598                                    ? NREFFA
11599                                    : (AT_LEAST_UNI_SEMANTICS)
11600                                      ? NREFFU
11601                                      : (LOC)
11602                                        ? NREFFL
11603                                        : NREFF),
11604                                 num);
11605                 *flagp |= HASWIDTH;
11606
11607                 /* override incorrect value set in reganode MJD */
11608                 Set_Node_Offset(ret, parse_start+1);
11609                 Set_Node_Cur_Length(ret, parse_start);
11610                 nextchar(pRExC_state);
11611
11612             }
11613             break;
11614         }
11615         case 'g':
11616         case '1': case '2': case '3': case '4':
11617         case '5': case '6': case '7': case '8': case '9':
11618             {
11619                 I32 num;
11620                 bool hasbrace = 0;
11621
11622                 if (*RExC_parse == 'g') {
11623                     bool isrel = 0;
11624
11625                     RExC_parse++;
11626                     if (*RExC_parse == '{') {
11627                         RExC_parse++;
11628                         hasbrace = 1;
11629                     }
11630                     if (*RExC_parse == '-') {
11631                         RExC_parse++;
11632                         isrel = 1;
11633                     }
11634                     if (hasbrace && !isDIGIT(*RExC_parse)) {
11635                         if (isrel) RExC_parse--;
11636                         RExC_parse -= 2;
11637                         goto parse_named_seq;
11638                     }
11639
11640                     num = S_backref_value(RExC_parse);
11641                     if (num == 0)
11642                         vFAIL("Reference to invalid group 0");
11643                     else if (num == I32_MAX) {
11644                          if (isDIGIT(*RExC_parse))
11645                             vFAIL("Reference to nonexistent group");
11646                         else
11647                             vFAIL("Unterminated \\g... pattern");
11648                     }
11649
11650                     if (isrel) {
11651                         num = RExC_npar - num;
11652                         if (num < 1)
11653                             vFAIL("Reference to nonexistent or unclosed group");
11654                     }
11655                 }
11656                 else {
11657                     num = S_backref_value(RExC_parse);
11658                     /* bare \NNN might be backref or octal - if it is larger than or equal
11659                      * RExC_npar then it is assumed to be and octal escape.
11660                      * Note RExC_npar is +1 from the actual number of parens*/
11661                     if (num == I32_MAX || (num > 9 && num >= RExC_npar
11662                             && *RExC_parse != '8' && *RExC_parse != '9'))
11663                     {
11664                         /* Probably a character specified in octal, e.g. \35 */
11665                         goto defchar;
11666                     }
11667                 }
11668
11669                 /* at this point RExC_parse definitely points to a backref
11670                  * number */
11671                 {
11672 #ifdef RE_TRACK_PATTERN_OFFSETS
11673                     char * const parse_start = RExC_parse - 1; /* MJD */
11674 #endif
11675                     while (isDIGIT(*RExC_parse))
11676                         RExC_parse++;
11677                     if (hasbrace) {
11678                         if (*RExC_parse != '}')
11679                             vFAIL("Unterminated \\g{...} pattern");
11680                         RExC_parse++;
11681                     }
11682                     if (!SIZE_ONLY) {
11683                         if (num > (I32)RExC_rx->nparens)
11684                             vFAIL("Reference to nonexistent group");
11685                     }
11686                     RExC_sawback = 1;
11687                     ret = reganode(pRExC_state,
11688                                    ((! FOLD)
11689                                      ? REF
11690                                      : (ASCII_FOLD_RESTRICTED)
11691                                        ? REFFA
11692                                        : (AT_LEAST_UNI_SEMANTICS)
11693                                          ? REFFU
11694                                          : (LOC)
11695                                            ? REFFL
11696                                            : REFF),
11697                                     num);
11698                     *flagp |= HASWIDTH;
11699
11700                     /* override incorrect value set in reganode MJD */
11701                     Set_Node_Offset(ret, parse_start+1);
11702                     Set_Node_Cur_Length(ret, parse_start);
11703                     RExC_parse--;
11704                     nextchar(pRExC_state);
11705                 }
11706             }
11707             break;
11708         case '\0':
11709             if (RExC_parse >= RExC_end)
11710                 FAIL("Trailing \\");
11711             /* FALLTHROUGH */
11712         default:
11713             /* Do not generate "unrecognized" warnings here, we fall
11714                back into the quick-grab loop below */
11715             parse_start--;
11716             goto defchar;
11717         }
11718         break;
11719
11720     case '#':
11721         if (RExC_flags & RXf_PMf_EXTENDED) {
11722             RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
11723             if (RExC_parse < RExC_end)
11724                 goto tryagain;
11725         }
11726         /* FALLTHROUGH */
11727
11728     default:
11729
11730             parse_start = RExC_parse - 1;
11731
11732             RExC_parse++;
11733
11734         defchar: {
11735             STRLEN len = 0;
11736             UV ender = 0;
11737             char *p;
11738             char *s;
11739 #define MAX_NODE_STRING_SIZE 127
11740             char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
11741             char *s0;
11742             U8 upper_parse = MAX_NODE_STRING_SIZE;
11743             U8 node_type = compute_EXACTish(pRExC_state);
11744             bool next_is_quantifier;
11745             char * oldp = NULL;
11746
11747             /* We can convert EXACTF nodes to EXACTFU if they contain only
11748              * characters that match identically regardless of the target
11749              * string's UTF8ness.  The reason to do this is that EXACTF is not
11750              * trie-able, EXACTFU is.
11751              *
11752              * Similarly, we can convert EXACTFL nodes to EXACTFU if they
11753              * contain only above-Latin1 characters (hence must be in UTF8),
11754              * which don't participate in folds with Latin1-range characters,
11755              * as the latter's folds aren't known until runtime.  (We don't
11756              * need to figure this out until pass 2) */
11757             bool maybe_exactfu = PASS2
11758                                && (node_type == EXACTF || node_type == EXACTFL);
11759
11760             /* If a folding node contains only code points that don't
11761              * participate in folds, it can be changed into an EXACT node,
11762              * which allows the optimizer more things to look for */
11763             bool maybe_exact;
11764
11765             ret = reg_node(pRExC_state, node_type);
11766
11767             /* In pass1, folded, we use a temporary buffer instead of the
11768              * actual node, as the node doesn't exist yet */
11769             s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
11770
11771             s0 = s;
11772
11773         reparse:
11774
11775             /* We do the EXACTFish to EXACT node only if folding.  (And we
11776              * don't need to figure this out until pass 2) */
11777             maybe_exact = FOLD && PASS2;
11778
11779             /* XXX The node can hold up to 255 bytes, yet this only goes to
11780              * 127.  I (khw) do not know why.  Keeping it somewhat less than
11781              * 255 allows us to not have to worry about overflow due to
11782              * converting to utf8 and fold expansion, but that value is
11783              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
11784              * split up by this limit into a single one using the real max of
11785              * 255.  Even at 127, this breaks under rare circumstances.  If
11786              * folding, we do not want to split a node at a character that is a
11787              * non-final in a multi-char fold, as an input string could just
11788              * happen to want to match across the node boundary.  The join
11789              * would solve that problem if the join actually happens.  But a
11790              * series of more than two nodes in a row each of 127 would cause
11791              * the first join to succeed to get to 254, but then there wouldn't
11792              * be room for the next one, which could at be one of those split
11793              * multi-char folds.  I don't know of any fool-proof solution.  One
11794              * could back off to end with only a code point that isn't such a
11795              * non-final, but it is possible for there not to be any in the
11796              * entire node. */
11797             for (p = RExC_parse - 1;
11798                  len < upper_parse && p < RExC_end;
11799                  len++)
11800             {
11801                 oldp = p;
11802
11803                 if (RExC_flags & RXf_PMf_EXTENDED)
11804                     p = regpatws(pRExC_state, p,
11805                                           TRUE); /* means recognize comments */
11806                 switch ((U8)*p) {
11807                 case '^':
11808                 case '$':
11809                 case '.':
11810                 case '[':
11811                 case '(':
11812                 case ')':
11813                 case '|':
11814                     goto loopdone;
11815                 case '\\':
11816                     /* Literal Escapes Switch
11817
11818                        This switch is meant to handle escape sequences that
11819                        resolve to a literal character.
11820
11821                        Every escape sequence that represents something
11822                        else, like an assertion or a char class, is handled
11823                        in the switch marked 'Special Escapes' above in this
11824                        routine, but also has an entry here as anything that
11825                        isn't explicitly mentioned here will be treated as
11826                        an unescaped equivalent literal.
11827                     */
11828
11829                     switch ((U8)*++p) {
11830                     /* These are all the special escapes. */
11831                     case 'A':             /* Start assertion */
11832                     case 'b': case 'B':   /* Word-boundary assertion*/
11833                     case 'C':             /* Single char !DANGEROUS! */
11834                     case 'd': case 'D':   /* digit class */
11835                     case 'g': case 'G':   /* generic-backref, pos assertion */
11836                     case 'h': case 'H':   /* HORIZWS */
11837                     case 'k': case 'K':   /* named backref, keep marker */
11838                     case 'p': case 'P':   /* Unicode property */
11839                               case 'R':   /* LNBREAK */
11840                     case 's': case 'S':   /* space class */
11841                     case 'v': case 'V':   /* VERTWS */
11842                     case 'w': case 'W':   /* word class */
11843                     case 'X':             /* eXtended Unicode "combining
11844                                              character sequence" */
11845                     case 'z': case 'Z':   /* End of line/string assertion */
11846                         --p;
11847                         goto loopdone;
11848
11849                     /* Anything after here is an escape that resolves to a
11850                        literal. (Except digits, which may or may not)
11851                      */
11852                     case 'n':
11853                         ender = '\n';
11854                         p++;
11855                         break;
11856                     case 'N': /* Handle a single-code point named character. */
11857                         /* The options cause it to fail if a multiple code
11858                          * point sequence.  Handle those in the switch() above
11859                          * */
11860                         RExC_parse = p + 1;
11861                         if (! grok_bslash_N(pRExC_state, NULL, &ender,
11862                                             flagp, depth, FALSE,
11863                                             FALSE /* not strict */ ))
11864                         {
11865                             if (*flagp & RESTART_UTF8)
11866                                 FAIL("panic: grok_bslash_N set RESTART_UTF8");
11867                             RExC_parse = p = oldp;
11868                             goto loopdone;
11869                         }
11870                         p = RExC_parse;
11871                         if (ender > 0xff) {
11872                             REQUIRE_UTF8;
11873                         }
11874                         break;
11875                     case 'r':
11876                         ender = '\r';
11877                         p++;
11878                         break;
11879                     case 't':
11880                         ender = '\t';
11881                         p++;
11882                         break;
11883                     case 'f':
11884                         ender = '\f';
11885                         p++;
11886                         break;
11887                     case 'e':
11888                           ender = ASCII_TO_NATIVE('\033');
11889                         p++;
11890                         break;
11891                     case 'a':
11892                           ender = '\a';
11893                         p++;
11894                         break;
11895                     case 'o':
11896                         {
11897                             UV result;
11898                             const char* error_msg;
11899
11900                             bool valid = grok_bslash_o(&p,
11901                                                        &result,
11902                                                        &error_msg,
11903                                                        TRUE, /* out warnings */
11904                                                        FALSE, /* not strict */
11905                                                        TRUE, /* Output warnings
11906                                                                 for non-
11907                                                                 portables */
11908                                                        UTF);
11909                             if (! valid) {
11910                                 RExC_parse = p; /* going to die anyway; point
11911                                                    to exact spot of failure */
11912                                 vFAIL(error_msg);
11913                             }
11914                             ender = result;
11915                             if (PL_encoding && ender < 0x100) {
11916                                 goto recode_encoding;
11917                             }
11918                             if (ender > 0xff) {
11919                                 REQUIRE_UTF8;
11920                             }
11921                             break;
11922                         }
11923                     case 'x':
11924                         {
11925                             UV result = UV_MAX; /* initialize to erroneous
11926                                                    value */
11927                             const char* error_msg;
11928
11929                             bool valid = grok_bslash_x(&p,
11930                                                        &result,
11931                                                        &error_msg,
11932                                                        TRUE, /* out warnings */
11933                                                        FALSE, /* not strict */
11934                                                        TRUE, /* Output warnings
11935                                                                 for non-
11936                                                                 portables */
11937                                                        UTF);
11938                             if (! valid) {
11939                                 RExC_parse = p; /* going to die anyway; point
11940                                                    to exact spot of failure */
11941                                 vFAIL(error_msg);
11942                             }
11943                             ender = result;
11944
11945                             if (PL_encoding && ender < 0x100) {
11946                                 goto recode_encoding;
11947                             }
11948                             if (ender > 0xff) {
11949                                 REQUIRE_UTF8;
11950                             }
11951                             break;
11952                         }
11953                     case 'c':
11954                         p++;
11955                         ender = grok_bslash_c(*p++, SIZE_ONLY);
11956                         break;
11957                     case '8': case '9': /* must be a backreference */
11958                         --p;
11959                         goto loopdone;
11960                     case '1': case '2': case '3':case '4':
11961                     case '5': case '6': case '7':
11962                         /* When we parse backslash escapes there is ambiguity
11963                          * between backreferences and octal escapes. Any escape
11964                          * from \1 - \9 is a backreference, any multi-digit
11965                          * escape which does not start with 0 and which when
11966                          * evaluated as decimal could refer to an already
11967                          * parsed capture buffer is a backslash. Anything else
11968                          * is octal.
11969                          *
11970                          * Note this implies that \118 could be interpreted as
11971                          * 118 OR as "\11" . "8" depending on whether there
11972                          * were 118 capture buffers defined already in the
11973                          * pattern.  */
11974
11975                         /* NOTE, RExC_npar is 1 more than the actual number of
11976                          * parens we have seen so far, hence the < RExC_npar below. */
11977
11978                         if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
11979                         {  /* Not to be treated as an octal constant, go
11980                                    find backref */
11981                             --p;
11982                             goto loopdone;
11983                         }
11984                         /* FALLTHROUGH */
11985                     case '0':
11986                         {
11987                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11988                             STRLEN numlen = 3;
11989                             ender = grok_oct(p, &numlen, &flags, NULL);
11990                             if (ender > 0xff) {
11991                                 REQUIRE_UTF8;
11992                             }
11993                             p += numlen;
11994                             if (SIZE_ONLY   /* like \08, \178 */
11995                                 && numlen < 3
11996                                 && p < RExC_end
11997                                 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
11998                             {
11999                                 reg_warn_non_literal_string(
12000                                          p + 1,
12001                                          form_short_octal_warning(p, numlen));
12002                             }
12003                         }
12004                         if (PL_encoding && ender < 0x100)
12005                             goto recode_encoding;
12006                         break;
12007                     recode_encoding:
12008                         if (! RExC_override_recoding) {
12009                             SV* enc = PL_encoding;
12010                             ender = reg_recode((const char)(U8)ender, &enc);
12011                             if (!enc && SIZE_ONLY)
12012                                 ckWARNreg(p, "Invalid escape in the specified encoding");
12013                             REQUIRE_UTF8;
12014                         }
12015                         break;
12016                     case '\0':
12017                         if (p >= RExC_end)
12018                             FAIL("Trailing \\");
12019                         /* FALLTHROUGH */
12020                     default:
12021                         if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
12022                             /* Include any { following the alpha to emphasize
12023                              * that it could be part of an escape at some point
12024                              * in the future */
12025                             int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
12026                             ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
12027                         }
12028                         goto normal_default;
12029                     } /* End of switch on '\' */
12030                     break;
12031                 default:    /* A literal character */
12032
12033                   normal_default:
12034                     if (UTF8_IS_START(*p) && UTF) {
12035                         STRLEN numlen;
12036                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
12037                                                &numlen, UTF8_ALLOW_DEFAULT);
12038                         p += numlen;
12039                     }
12040                     else
12041                         ender = (U8) *p++;
12042                     break;
12043                 } /* End of switch on the literal */
12044
12045                 /* Here, have looked at the literal character and <ender>
12046                  * contains its ordinal, <p> points to the character after it
12047                  */
12048
12049                 if ( RExC_flags & RXf_PMf_EXTENDED)
12050                     p = regpatws(pRExC_state, p,
12051                                           TRUE); /* means recognize comments */
12052
12053                 /* If the next thing is a quantifier, it applies to this
12054                  * character only, which means that this character has to be in
12055                  * its own node and can't just be appended to the string in an
12056                  * existing node, so if there are already other characters in
12057                  * the node, close the node with just them, and set up to do
12058                  * this character again next time through, when it will be the
12059                  * only thing in its new node */
12060                 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
12061                 {
12062                     p = oldp;
12063                     goto loopdone;
12064                 }
12065
12066                 if (! FOLD   /* The simple case, just append the literal */
12067                     || (LOC  /* Also don't fold for tricky chars under /l */
12068                         && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)))
12069                 {
12070                     if (UTF) {
12071                         const STRLEN unilen = reguni(pRExC_state, ender, s);
12072                         if (unilen > 0) {
12073                            s   += unilen;
12074                            len += unilen;
12075                         }
12076
12077                         /* The loop increments <len> each time, as all but this
12078                          * path (and one other) through it add a single byte to
12079                          * the EXACTish node.  But this one has changed len to
12080                          * be the correct final value, so subtract one to
12081                          * cancel out the increment that follows */
12082                         len--;
12083                     }
12084                     else {
12085                         REGC((char)ender, s++);
12086                     }
12087
12088                     /* Can get here if folding only if is one of the /l
12089                      * characters whose fold depends on the locale.  The
12090                      * occurrence of any of these indicate that we can't
12091                      * simplify things */
12092                     if (FOLD) {
12093                         maybe_exact = FALSE;
12094                         maybe_exactfu = FALSE;
12095                     }
12096                 }
12097                 else             /* FOLD */
12098                      if (! ( UTF
12099                         /* See comments for join_exact() as to why we fold this
12100                          * non-UTF at compile time */
12101                         || (node_type == EXACTFU
12102                             && ender == LATIN_SMALL_LETTER_SHARP_S)))
12103                 {
12104                     /* Here, are folding and are not UTF-8 encoded; therefore
12105                      * the character must be in the range 0-255, and is not /l
12106                      * (Not /l because we already handled these under /l in
12107                      * is_PROBLEMATIC_LOCALE_FOLD_cp */
12108                     if (IS_IN_SOME_FOLD_L1(ender)) {
12109                         maybe_exact = FALSE;
12110
12111                         /* See if the character's fold differs between /d and
12112                          * /u.  This includes the multi-char fold SHARP S to
12113                          * 'ss' */
12114                         if (maybe_exactfu
12115                             && (PL_fold[ender] != PL_fold_latin1[ender]
12116                                 || ender == LATIN_SMALL_LETTER_SHARP_S
12117                                 || (len > 0
12118                                    && isARG2_lower_or_UPPER_ARG1('s', ender)
12119                                    && isARG2_lower_or_UPPER_ARG1('s',
12120                                                                  *(s-1)))))
12121                         {
12122                             maybe_exactfu = FALSE;
12123                         }
12124                     }
12125
12126                     /* Even when folding, we store just the input character, as
12127                      * we have an array that finds its fold quickly */
12128                     *(s++) = (char) ender;
12129                 }
12130                 else {  /* FOLD and UTF */
12131                     /* Unlike the non-fold case, we do actually have to
12132                      * calculate the results here in pass 1.  This is for two
12133                      * reasons, the folded length may be longer than the
12134                      * unfolded, and we have to calculate how many EXACTish
12135                      * nodes it will take; and we may run out of room in a node
12136                      * in the middle of a potential multi-char fold, and have
12137                      * to back off accordingly.  (Hence we can't use REGC for
12138                      * the simple case just below.) */
12139
12140                     UV folded;
12141                     if (isASCII(ender)) {
12142                         folded = toFOLD(ender);
12143                         *(s)++ = (U8) folded;
12144                     }
12145                     else {
12146                         STRLEN foldlen;
12147
12148                         folded = _to_uni_fold_flags(
12149                                      ender,
12150                                      (U8 *) s,
12151                                      &foldlen,
12152                                      FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12153                                                         ? FOLD_FLAGS_NOMIX_ASCII
12154                                                         : 0));
12155                         s += foldlen;
12156
12157                         /* The loop increments <len> each time, as all but this
12158                          * path (and one other) through it add a single byte to
12159                          * the EXACTish node.  But this one has changed len to
12160                          * be the correct final value, so subtract one to
12161                          * cancel out the increment that follows */
12162                         len += foldlen - 1;
12163                     }
12164                     /* If this node only contains non-folding code points so
12165                      * far, see if this new one is also non-folding */
12166                     if (maybe_exact) {
12167                         if (folded != ender) {
12168                             maybe_exact = FALSE;
12169                         }
12170                         else {
12171                             /* Here the fold is the original; we have to check
12172                              * further to see if anything folds to it */
12173                             if (_invlist_contains_cp(PL_utf8_foldable,
12174                                                         ender))
12175                             {
12176                                 maybe_exact = FALSE;
12177                             }
12178                         }
12179                     }
12180                     ender = folded;
12181                 }
12182
12183                 if (next_is_quantifier) {
12184
12185                     /* Here, the next input is a quantifier, and to get here,
12186                      * the current character is the only one in the node.
12187                      * Also, here <len> doesn't include the final byte for this
12188                      * character */
12189                     len++;
12190                     goto loopdone;
12191                 }
12192
12193             } /* End of loop through literal characters */
12194
12195             /* Here we have either exhausted the input or ran out of room in
12196              * the node.  (If we encountered a character that can't be in the
12197              * node, transfer is made directly to <loopdone>, and so we
12198              * wouldn't have fallen off the end of the loop.)  In the latter
12199              * case, we artificially have to split the node into two, because
12200              * we just don't have enough space to hold everything.  This
12201              * creates a problem if the final character participates in a
12202              * multi-character fold in the non-final position, as a match that
12203              * should have occurred won't, due to the way nodes are matched,
12204              * and our artificial boundary.  So back off until we find a non-
12205              * problematic character -- one that isn't at the beginning or
12206              * middle of such a fold.  (Either it doesn't participate in any
12207              * folds, or appears only in the final position of all the folds it
12208              * does participate in.)  A better solution with far fewer false
12209              * positives, and that would fill the nodes more completely, would
12210              * be to actually have available all the multi-character folds to
12211              * test against, and to back-off only far enough to be sure that
12212              * this node isn't ending with a partial one.  <upper_parse> is set
12213              * further below (if we need to reparse the node) to include just
12214              * up through that final non-problematic character that this code
12215              * identifies, so when it is set to less than the full node, we can
12216              * skip the rest of this */
12217             if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
12218
12219                 const STRLEN full_len = len;
12220
12221                 assert(len >= MAX_NODE_STRING_SIZE);
12222
12223                 /* Here, <s> points to the final byte of the final character.
12224                  * Look backwards through the string until find a non-
12225                  * problematic character */
12226
12227                 if (! UTF) {
12228
12229                     /* This has no multi-char folds to non-UTF characters */
12230                     if (ASCII_FOLD_RESTRICTED) {
12231                         goto loopdone;
12232                     }
12233
12234                     while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
12235                     len = s - s0 + 1;
12236                 }
12237                 else {
12238                     if (!  PL_NonL1NonFinalFold) {
12239                         PL_NonL1NonFinalFold = _new_invlist_C_array(
12240                                         NonL1_Perl_Non_Final_Folds_invlist);
12241                     }
12242
12243                     /* Point to the first byte of the final character */
12244                     s = (char *) utf8_hop((U8 *) s, -1);
12245
12246                     while (s >= s0) {   /* Search backwards until find
12247                                            non-problematic char */
12248                         if (UTF8_IS_INVARIANT(*s)) {
12249
12250                             /* There are no ascii characters that participate
12251                              * in multi-char folds under /aa.  In EBCDIC, the
12252                              * non-ascii invariants are all control characters,
12253                              * so don't ever participate in any folds. */
12254                             if (ASCII_FOLD_RESTRICTED
12255                                 || ! IS_NON_FINAL_FOLD(*s))
12256                             {
12257                                 break;
12258                             }
12259                         }
12260                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
12261                             if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
12262                                                                   *s, *(s+1))))
12263                             {
12264                                 break;
12265                             }
12266                         }
12267                         else if (! _invlist_contains_cp(
12268                                         PL_NonL1NonFinalFold,
12269                                         valid_utf8_to_uvchr((U8 *) s, NULL)))
12270                         {
12271                             break;
12272                         }
12273
12274                         /* Here, the current character is problematic in that
12275                          * it does occur in the non-final position of some
12276                          * fold, so try the character before it, but have to
12277                          * special case the very first byte in the string, so
12278                          * we don't read outside the string */
12279                         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
12280                     } /* End of loop backwards through the string */
12281
12282                     /* If there were only problematic characters in the string,
12283                      * <s> will point to before s0, in which case the length
12284                      * should be 0, otherwise include the length of the
12285                      * non-problematic character just found */
12286                     len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
12287                 }
12288
12289                 /* Here, have found the final character, if any, that is
12290                  * non-problematic as far as ending the node without splitting
12291                  * it across a potential multi-char fold.  <len> contains the
12292                  * number of bytes in the node up-to and including that
12293                  * character, or is 0 if there is no such character, meaning
12294                  * the whole node contains only problematic characters.  In
12295                  * this case, give up and just take the node as-is.  We can't
12296                  * do any better */
12297                 if (len == 0) {
12298                     len = full_len;
12299
12300                     /* If the node ends in an 's' we make sure it stays EXACTF,
12301                      * as if it turns into an EXACTFU, it could later get
12302                      * joined with another 's' that would then wrongly match
12303                      * the sharp s */
12304                     if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
12305                     {
12306                         maybe_exactfu = FALSE;
12307                     }
12308                 } else {
12309
12310                     /* Here, the node does contain some characters that aren't
12311                      * problematic.  If one such is the final character in the
12312                      * node, we are done */
12313                     if (len == full_len) {
12314                         goto loopdone;
12315                     }
12316                     else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
12317
12318                         /* If the final character is problematic, but the
12319                          * penultimate is not, back-off that last character to
12320                          * later start a new node with it */
12321                         p = oldp;
12322                         goto loopdone;
12323                     }
12324
12325                     /* Here, the final non-problematic character is earlier
12326                      * in the input than the penultimate character.  What we do
12327                      * is reparse from the beginning, going up only as far as
12328                      * this final ok one, thus guaranteeing that the node ends
12329                      * in an acceptable character.  The reason we reparse is
12330                      * that we know how far in the character is, but we don't
12331                      * know how to correlate its position with the input parse.
12332                      * An alternate implementation would be to build that
12333                      * correlation as we go along during the original parse,
12334                      * but that would entail extra work for every node, whereas
12335                      * this code gets executed only when the string is too
12336                      * large for the node, and the final two characters are
12337                      * problematic, an infrequent occurrence.  Yet another
12338                      * possible strategy would be to save the tail of the
12339                      * string, and the next time regatom is called, initialize
12340                      * with that.  The problem with this is that unless you
12341                      * back off one more character, you won't be guaranteed
12342                      * regatom will get called again, unless regbranch,
12343                      * regpiece ... are also changed.  If you do back off that
12344                      * extra character, so that there is input guaranteed to
12345                      * force calling regatom, you can't handle the case where
12346                      * just the first character in the node is acceptable.  I
12347                      * (khw) decided to try this method which doesn't have that
12348                      * pitfall; if performance issues are found, we can do a
12349                      * combination of the current approach plus that one */
12350                     upper_parse = len;
12351                     len = 0;
12352                     s = s0;
12353                     goto reparse;
12354                 }
12355             }   /* End of verifying node ends with an appropriate char */
12356
12357         loopdone:   /* Jumped to when encounters something that shouldn't be in
12358                        the node */
12359
12360             /* I (khw) don't know if you can get here with zero length, but the
12361              * old code handled this situation by creating a zero-length EXACT
12362              * node.  Might as well be NOTHING instead */
12363             if (len == 0) {
12364                 OP(ret) = NOTHING;
12365             }
12366             else {
12367                 if (FOLD) {
12368                     /* If 'maybe_exact' is still set here, means there are no
12369                      * code points in the node that participate in folds;
12370                      * similarly for 'maybe_exactfu' and code points that match
12371                      * differently depending on UTF8ness of the target string
12372                      * (for /u), or depending on locale for /l */
12373                     if (maybe_exact) {
12374                         OP(ret) = EXACT;
12375                     }
12376                     else if (maybe_exactfu) {
12377                         OP(ret) = EXACTFU;
12378                     }
12379                 }
12380                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
12381                                            FALSE /* Don't look to see if could
12382                                                     be turned into an EXACT
12383                                                     node, as we have already
12384                                                     computed that */
12385                                           );
12386             }
12387
12388             RExC_parse = p - 1;
12389             Set_Node_Cur_Length(ret, parse_start);
12390             nextchar(pRExC_state);
12391             {
12392                 /* len is STRLEN which is unsigned, need to copy to signed */
12393                 IV iv = len;
12394                 if (iv < 0)
12395                     vFAIL("Internal disaster");
12396             }
12397
12398         } /* End of label 'defchar:' */
12399         break;
12400     } /* End of giant switch on input character */
12401
12402     return(ret);
12403 }
12404
12405 STATIC char *
12406 S_regpatws(pTHX_ RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
12407 {
12408     /* Returns the next non-pattern-white space, non-comment character (the
12409      * latter only if 'recognize_comment is true) in the string p, which is
12410      * ended by RExC_end.  See also reg_skipcomment */
12411     const char *e = RExC_end;
12412
12413     PERL_ARGS_ASSERT_REGPATWS;
12414
12415     while (p < e) {
12416         STRLEN len;
12417         if ((len = is_PATWS_safe(p, e, UTF))) {
12418             p += len;
12419         }
12420         else if (recognize_comment && *p == '#') {
12421             p = reg_skipcomment(pRExC_state, p);
12422         }
12423         else
12424             break;
12425     }
12426     return p;
12427 }
12428
12429 STATIC void
12430 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
12431 {
12432     /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
12433      * sets up the bitmap and any flags, removing those code points from the
12434      * inversion list, setting it to NULL should it become completely empty */
12435
12436     PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
12437     assert(PL_regkind[OP(node)] == ANYOF);
12438
12439     ANYOF_BITMAP_ZERO(node);
12440     if (*invlist_ptr) {
12441
12442         /* This gets set if we actually need to modify things */
12443         bool change_invlist = FALSE;
12444
12445         UV start, end;
12446
12447         /* Start looking through *invlist_ptr */
12448         invlist_iterinit(*invlist_ptr);
12449         while (invlist_iternext(*invlist_ptr, &start, &end)) {
12450             UV high;
12451             int i;
12452
12453             if (end == UV_MAX && start <= 256) {
12454                 ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL;
12455             }
12456             else if (end >= 256) {
12457                 ANYOF_FLAGS(node) |= ANYOF_UTF8;
12458             }
12459
12460             /* Quit if are above what we should change */
12461             if (start > 255) {
12462                 break;
12463             }
12464
12465             change_invlist = TRUE;
12466
12467             /* Set all the bits in the range, up to the max that we are doing */
12468             high = (end < 255) ? end : 255;
12469             for (i = start; i <= (int) high; i++) {
12470                 if (! ANYOF_BITMAP_TEST(node, i)) {
12471                     ANYOF_BITMAP_SET(node, i);
12472                 }
12473             }
12474         }
12475         invlist_iterfinish(*invlist_ptr);
12476
12477         /* Done with loop; remove any code points that are in the bitmap from
12478          * *invlist_ptr; similarly for code points above latin1 if we have a
12479          * flag to match all of them anyways */
12480         if (change_invlist) {
12481             _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr);
12482         }
12483         if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
12484             _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr);
12485         }
12486
12487         /* If have completely emptied it, remove it completely */
12488         if (_invlist_len(*invlist_ptr) == 0) {
12489             SvREFCNT_dec_NN(*invlist_ptr);
12490             *invlist_ptr = NULL;
12491         }
12492     }
12493 }
12494
12495 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
12496    Character classes ([:foo:]) can also be negated ([:^foo:]).
12497    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
12498    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
12499    but trigger failures because they are currently unimplemented. */
12500
12501 #define POSIXCC_DONE(c)   ((c) == ':')
12502 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
12503 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
12504
12505 PERL_STATIC_INLINE I32
12506 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
12507 {
12508     dVAR;
12509     I32 namedclass = OOB_NAMEDCLASS;
12510
12511     PERL_ARGS_ASSERT_REGPPOSIXCC;
12512
12513     if (value == '[' && RExC_parse + 1 < RExC_end &&
12514         /* I smell either [: or [= or [. -- POSIX has been here, right? */
12515         POSIXCC(UCHARAT(RExC_parse)))
12516     {
12517         const char c = UCHARAT(RExC_parse);
12518         char* const s = RExC_parse++;
12519
12520         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
12521             RExC_parse++;
12522         if (RExC_parse == RExC_end) {
12523             if (strict) {
12524
12525                 /* Try to give a better location for the error (than the end of
12526                  * the string) by looking for the matching ']' */
12527                 RExC_parse = s;
12528                 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
12529                     RExC_parse++;
12530                 }
12531                 vFAIL2("Unmatched '%c' in POSIX class", c);
12532             }
12533             /* Grandfather lone [:, [=, [. */
12534             RExC_parse = s;
12535         }
12536         else {
12537             const char* const t = RExC_parse++; /* skip over the c */
12538             assert(*t == c);
12539
12540             if (UCHARAT(RExC_parse) == ']') {
12541                 const char *posixcc = s + 1;
12542                 RExC_parse++; /* skip over the ending ] */
12543
12544                 if (*s == ':') {
12545                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
12546                     const I32 skip = t - posixcc;
12547
12548                     /* Initially switch on the length of the name.  */
12549                     switch (skip) {
12550                     case 4:
12551                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
12552                                                           this is the Perl \w
12553                                                         */
12554                             namedclass = ANYOF_WORDCHAR;
12555                         break;
12556                     case 5:
12557                         /* Names all of length 5.  */
12558                         /* alnum alpha ascii blank cntrl digit graph lower
12559                            print punct space upper  */
12560                         /* Offset 4 gives the best switch position.  */
12561                         switch (posixcc[4]) {
12562                         case 'a':
12563                             if (memEQ(posixcc, "alph", 4)) /* alpha */
12564                                 namedclass = ANYOF_ALPHA;
12565                             break;
12566                         case 'e':
12567                             if (memEQ(posixcc, "spac", 4)) /* space */
12568                                 namedclass = ANYOF_PSXSPC;
12569                             break;
12570                         case 'h':
12571                             if (memEQ(posixcc, "grap", 4)) /* graph */
12572                                 namedclass = ANYOF_GRAPH;
12573                             break;
12574                         case 'i':
12575                             if (memEQ(posixcc, "asci", 4)) /* ascii */
12576                                 namedclass = ANYOF_ASCII;
12577                             break;
12578                         case 'k':
12579                             if (memEQ(posixcc, "blan", 4)) /* blank */
12580                                 namedclass = ANYOF_BLANK;
12581                             break;
12582                         case 'l':
12583                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
12584                                 namedclass = ANYOF_CNTRL;
12585                             break;
12586                         case 'm':
12587                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
12588                                 namedclass = ANYOF_ALPHANUMERIC;
12589                             break;
12590                         case 'r':
12591                             if (memEQ(posixcc, "lowe", 4)) /* lower */
12592                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
12593                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
12594                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
12595                             break;
12596                         case 't':
12597                             if (memEQ(posixcc, "digi", 4)) /* digit */
12598                                 namedclass = ANYOF_DIGIT;
12599                             else if (memEQ(posixcc, "prin", 4)) /* print */
12600                                 namedclass = ANYOF_PRINT;
12601                             else if (memEQ(posixcc, "punc", 4)) /* punct */
12602                                 namedclass = ANYOF_PUNCT;
12603                             break;
12604                         }
12605                         break;
12606                     case 6:
12607                         if (memEQ(posixcc, "xdigit", 6))
12608                             namedclass = ANYOF_XDIGIT;
12609                         break;
12610                     }
12611
12612                     if (namedclass == OOB_NAMEDCLASS)
12613                         vFAIL2utf8f(
12614                             "POSIX class [:%"UTF8f":] unknown",
12615                             UTF8fARG(UTF, t - s - 1, s + 1));
12616
12617                     /* The #defines are structured so each complement is +1 to
12618                      * the normal one */
12619                     if (complement) {
12620                         namedclass++;
12621                     }
12622                     assert (posixcc[skip] == ':');
12623                     assert (posixcc[skip+1] == ']');
12624                 } else if (!SIZE_ONLY) {
12625                     /* [[=foo=]] and [[.foo.]] are still future. */
12626
12627                     /* adjust RExC_parse so the warning shows after
12628                        the class closes */
12629                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
12630                         RExC_parse++;
12631                     vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
12632                 }
12633             } else {
12634                 /* Maternal grandfather:
12635                  * "[:" ending in ":" but not in ":]" */
12636                 if (strict) {
12637                     vFAIL("Unmatched '[' in POSIX class");
12638                 }
12639
12640                 /* Grandfather lone [:, [=, [. */
12641                 RExC_parse = s;
12642             }
12643         }
12644     }
12645
12646     return namedclass;
12647 }
12648
12649 STATIC bool
12650 S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state)
12651 {
12652     /* This applies some heuristics at the current parse position (which should
12653      * be at a '[') to see if what follows might be intended to be a [:posix:]
12654      * class.  It returns true if it really is a posix class, of course, but it
12655      * also can return true if it thinks that what was intended was a posix
12656      * class that didn't quite make it.
12657      *
12658      * It will return true for
12659      *      [:alphanumerics:
12660      *      [:alphanumerics]  (as long as the ] isn't followed immediately by a
12661      *                         ')' indicating the end of the (?[
12662      *      [:any garbage including %^&$ punctuation:]
12663      *
12664      * This is designed to be called only from S_handle_regex_sets; it could be
12665      * easily adapted to be called from the spot at the beginning of regclass()
12666      * that checks to see in a normal bracketed class if the surrounding []
12667      * have been omitted ([:word:] instead of [[:word:]]).  But doing so would
12668      * change long-standing behavior, so I (khw) didn't do that */
12669     char* p = RExC_parse + 1;
12670     char first_char = *p;
12671
12672     PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
12673
12674     assert(*(p - 1) == '[');
12675
12676     if (! POSIXCC(first_char)) {
12677         return FALSE;
12678     }
12679
12680     p++;
12681     while (p < RExC_end && isWORDCHAR(*p)) p++;
12682
12683     if (p >= RExC_end) {
12684         return FALSE;
12685     }
12686
12687     if (p - RExC_parse > 2    /* Got at least 1 word character */
12688         && (*p == first_char
12689             || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
12690     {
12691         return TRUE;
12692     }
12693
12694     p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
12695
12696     return (p
12697             && p - RExC_parse > 2 /* [:] evaluates to colon;
12698                                       [::] is a bad posix class. */
12699             && first_char == *(p - 1));
12700 }
12701
12702 STATIC regnode *
12703 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
12704                     I32 *flagp, U32 depth,
12705                     char * const oregcomp_parse)
12706 {
12707     /* Handle the (?[...]) construct to do set operations */
12708
12709     U8 curchar;
12710     UV start, end;      /* End points of code point ranges */
12711     SV* result_string;
12712     char *save_end, *save_parse;
12713     SV* final;
12714     STRLEN len;
12715     regnode* node;
12716     AV* stack;
12717     const bool save_fold = FOLD;
12718
12719     GET_RE_DEBUG_FLAGS_DECL;
12720
12721     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
12722
12723     if (LOC) {
12724         vFAIL("(?[...]) not valid in locale");
12725     }
12726     RExC_uni_semantics = 1;
12727
12728     /* This will return only an ANYOF regnode, or (unlikely) something smaller
12729      * (such as EXACT).  Thus we can skip most everything if just sizing.  We
12730      * call regclass to handle '[]' so as to not have to reinvent its parsing
12731      * rules here (throwing away the size it computes each time).  And, we exit
12732      * upon an unescaped ']' that isn't one ending a regclass.  To do both
12733      * these things, we need to realize that something preceded by a backslash
12734      * is escaped, so we have to keep track of backslashes */
12735     if (SIZE_ONLY) {
12736         UV depth = 0; /* how many nested (?[...]) constructs */
12737
12738         Perl_ck_warner_d(aTHX_
12739             packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
12740             "The regex_sets feature is experimental" REPORT_LOCATION,
12741                 UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
12742                 UTF8fARG(UTF,
12743                          RExC_end - RExC_start - (RExC_parse - RExC_precomp),
12744                          RExC_precomp + (RExC_parse - RExC_precomp)));
12745
12746         while (RExC_parse < RExC_end) {
12747             SV* current = NULL;
12748             RExC_parse = regpatws(pRExC_state, RExC_parse,
12749                                           TRUE); /* means recognize comments */
12750             switch (*RExC_parse) {
12751                 case '?':
12752                     if (RExC_parse[1] == '[') depth++, RExC_parse++;
12753                     /* FALLTHROUGH */
12754                 default:
12755                     break;
12756                 case '\\':
12757                     /* Skip the next byte (which could cause us to end up in
12758                      * the middle of a UTF-8 character, but since none of those
12759                      * are confusable with anything we currently handle in this
12760                      * switch (invariants all), it's safe.  We'll just hit the
12761                      * default: case next time and keep on incrementing until
12762                      * we find one of the invariants we do handle. */
12763                     RExC_parse++;
12764                     break;
12765                 case '[':
12766                 {
12767                     /* If this looks like it is a [:posix:] class, leave the
12768                      * parse pointer at the '[' to fool regclass() into
12769                      * thinking it is part of a '[[:posix:]]'.  That function
12770                      * will use strict checking to force a syntax error if it
12771                      * doesn't work out to a legitimate class */
12772                     bool is_posix_class
12773                                     = could_it_be_a_POSIX_class(pRExC_state);
12774                     if (! is_posix_class) {
12775                         RExC_parse++;
12776                     }
12777
12778                     /* regclass() can only return RESTART_UTF8 if multi-char
12779                        folds are allowed.  */
12780                     if (!regclass(pRExC_state, flagp,depth+1,
12781                                   is_posix_class, /* parse the whole char
12782                                                      class only if not a
12783                                                      posix class */
12784                                   FALSE, /* don't allow multi-char folds */
12785                                   TRUE, /* silence non-portable warnings. */
12786                                   &current))
12787                         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12788                               (UV) *flagp);
12789
12790                     /* function call leaves parse pointing to the ']', except
12791                      * if we faked it */
12792                     if (is_posix_class) {
12793                         RExC_parse--;
12794                     }
12795
12796                     SvREFCNT_dec(current);   /* In case it returned something */
12797                     break;
12798                 }
12799
12800                 case ']':
12801                     if (depth--) break;
12802                     RExC_parse++;
12803                     if (RExC_parse < RExC_end
12804                         && *RExC_parse == ')')
12805                     {
12806                         node = reganode(pRExC_state, ANYOF, 0);
12807                         RExC_size += ANYOF_SKIP;
12808                         nextchar(pRExC_state);
12809                         Set_Node_Length(node,
12810                                 RExC_parse - oregcomp_parse + 1); /* MJD */
12811                         return node;
12812                     }
12813                     goto no_close;
12814             }
12815             RExC_parse++;
12816         }
12817
12818         no_close:
12819         FAIL("Syntax error in (?[...])");
12820     }
12821
12822     /* Pass 2 only after this.  Everything in this construct is a
12823      * metacharacter.  Operands begin with either a '\' (for an escape
12824      * sequence), or a '[' for a bracketed character class.  Any other
12825      * character should be an operator, or parenthesis for grouping.  Both
12826      * types of operands are handled by calling regclass() to parse them.  It
12827      * is called with a parameter to indicate to return the computed inversion
12828      * list.  The parsing here is implemented via a stack.  Each entry on the
12829      * stack is a single character representing one of the operators, or the
12830      * '('; or else a pointer to an operand inversion list. */
12831
12832 #define IS_OPERAND(a)  (! SvIOK(a))
12833
12834     /* The stack starts empty.  It is a syntax error if the first thing parsed
12835      * is a binary operator; everything else is pushed on the stack.  When an
12836      * operand is parsed, the top of the stack is examined.  If it is a binary
12837      * operator, the item before it should be an operand, and both are replaced
12838      * by the result of doing that operation on the new operand and the one on
12839      * the stack.   Thus a sequence of binary operands is reduced to a single
12840      * one before the next one is parsed.
12841      *
12842      * A unary operator may immediately follow a binary in the input, for
12843      * example
12844      *      [a] + ! [b]
12845      * When an operand is parsed and the top of the stack is a unary operator,
12846      * the operation is performed, and then the stack is rechecked to see if
12847      * this new operand is part of a binary operation; if so, it is handled as
12848      * above.
12849      *
12850      * A '(' is simply pushed on the stack; it is valid only if the stack is
12851      * empty, or the top element of the stack is an operator or another '('
12852      * (for which the parenthesized expression will become an operand).  By the
12853      * time the corresponding ')' is parsed everything in between should have
12854      * been parsed and evaluated to a single operand (or else is a syntax
12855      * error), and is handled as a regular operand */
12856
12857     sv_2mortal((SV *)(stack = newAV()));
12858
12859     while (RExC_parse < RExC_end) {
12860         I32 top_index = av_tindex(stack);
12861         SV** top_ptr;
12862         SV* current = NULL;
12863
12864         /* Skip white space */
12865         RExC_parse = regpatws(pRExC_state, RExC_parse,
12866                                          TRUE /* means recognize comments */ );
12867         if (RExC_parse >= RExC_end) {
12868             Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
12869         }
12870         if ((curchar = UCHARAT(RExC_parse)) == ']') {
12871             break;
12872         }
12873
12874         switch (curchar) {
12875
12876             case '?':
12877                 if (av_tindex(stack) >= 0   /* This makes sure that we can
12878                                                safely subtract 1 from
12879                                                RExC_parse in the next clause.
12880                                                If we have something on the
12881                                                stack, we have parsed something
12882                                              */
12883                     && UCHARAT(RExC_parse - 1) == '('
12884                     && RExC_parse < RExC_end)
12885                 {
12886                     /* If is a '(?', could be an embedded '(?flags:(?[...])'.
12887                      * This happens when we have some thing like
12888                      *
12889                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
12890                      *   ...
12891                      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
12892                      *
12893                      * Here we would be handling the interpolated
12894                      * '$thai_or_lao'.  We handle this by a recursive call to
12895                      * ourselves which returns the inversion list the
12896                      * interpolated expression evaluates to.  We use the flags
12897                      * from the interpolated pattern. */
12898                     U32 save_flags = RExC_flags;
12899                     const char * const save_parse = ++RExC_parse;
12900
12901                     parse_lparen_question_flags(pRExC_state);
12902
12903                     if (RExC_parse == save_parse  /* Makes sure there was at
12904                                                      least one flag (or this
12905                                                      embedding wasn't compiled)
12906                                                    */
12907                         || RExC_parse >= RExC_end - 4
12908                         || UCHARAT(RExC_parse) != ':'
12909                         || UCHARAT(++RExC_parse) != '('
12910                         || UCHARAT(++RExC_parse) != '?'
12911                         || UCHARAT(++RExC_parse) != '[')
12912                     {
12913
12914                         /* In combination with the above, this moves the
12915                          * pointer to the point just after the first erroneous
12916                          * character (or if there are no flags, to where they
12917                          * should have been) */
12918                         if (RExC_parse >= RExC_end - 4) {
12919                             RExC_parse = RExC_end;
12920                         }
12921                         else if (RExC_parse != save_parse) {
12922                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12923                         }
12924                         vFAIL("Expecting '(?flags:(?[...'");
12925                     }
12926                     RExC_parse++;
12927                     (void) handle_regex_sets(pRExC_state, &current, flagp,
12928                                                     depth+1, oregcomp_parse);
12929
12930                     /* Here, 'current' contains the embedded expression's
12931                      * inversion list, and RExC_parse points to the trailing
12932                      * ']'; the next character should be the ')' which will be
12933                      * paired with the '(' that has been put on the stack, so
12934                      * the whole embedded expression reduces to '(operand)' */
12935                     RExC_parse++;
12936
12937                     RExC_flags = save_flags;
12938                     goto handle_operand;
12939                 }
12940                 /* FALLTHROUGH */
12941
12942             default:
12943                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12944                 vFAIL("Unexpected character");
12945
12946             case '\\':
12947                 /* regclass() can only return RESTART_UTF8 if multi-char
12948                    folds are allowed.  */
12949                 if (!regclass(pRExC_state, flagp,depth+1,
12950                               TRUE, /* means parse just the next thing */
12951                               FALSE, /* don't allow multi-char folds */
12952                               FALSE, /* don't silence non-portable warnings.  */
12953                               &current))
12954                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12955                           (UV) *flagp);
12956                 /* regclass() will return with parsing just the \ sequence,
12957                  * leaving the parse pointer at the next thing to parse */
12958                 RExC_parse--;
12959                 goto handle_operand;
12960
12961             case '[':   /* Is a bracketed character class */
12962             {
12963                 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
12964
12965                 if (! is_posix_class) {
12966                     RExC_parse++;
12967                 }
12968
12969                 /* regclass() can only return RESTART_UTF8 if multi-char
12970                    folds are allowed.  */
12971                 if(!regclass(pRExC_state, flagp,depth+1,
12972                              is_posix_class, /* parse the whole char class
12973                                                 only if not a posix class */
12974                              FALSE, /* don't allow multi-char folds */
12975                              FALSE, /* don't silence non-portable warnings.  */
12976                              &current))
12977                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12978                           (UV) *flagp);
12979                 /* function call leaves parse pointing to the ']', except if we
12980                  * faked it */
12981                 if (is_posix_class) {
12982                     RExC_parse--;
12983                 }
12984
12985                 goto handle_operand;
12986             }
12987
12988             case '&':
12989             case '|':
12990             case '+':
12991             case '-':
12992             case '^':
12993                 if (top_index < 0
12994                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
12995                     || ! IS_OPERAND(*top_ptr))
12996                 {
12997                     RExC_parse++;
12998                     vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
12999                 }
13000                 av_push(stack, newSVuv(curchar));
13001                 break;
13002
13003             case '!':
13004                 av_push(stack, newSVuv(curchar));
13005                 break;
13006
13007             case '(':
13008                 if (top_index >= 0) {
13009                     top_ptr = av_fetch(stack, top_index, FALSE);
13010                     assert(top_ptr);
13011                     if (IS_OPERAND(*top_ptr)) {
13012                         RExC_parse++;
13013                         vFAIL("Unexpected '(' with no preceding operator");
13014                     }
13015                 }
13016                 av_push(stack, newSVuv(curchar));
13017                 break;
13018
13019             case ')':
13020             {
13021                 SV* lparen;
13022                 if (top_index < 1
13023                     || ! (current = av_pop(stack))
13024                     || ! IS_OPERAND(current)
13025                     || ! (lparen = av_pop(stack))
13026                     || IS_OPERAND(lparen)
13027                     || SvUV(lparen) != '(')
13028                 {
13029                     SvREFCNT_dec(current);
13030                     RExC_parse++;
13031                     vFAIL("Unexpected ')'");
13032                 }
13033                 top_index -= 2;
13034                 SvREFCNT_dec_NN(lparen);
13035
13036                 /* FALLTHROUGH */
13037             }
13038
13039               handle_operand:
13040
13041                 /* Here, we have an operand to process, in 'current' */
13042
13043                 if (top_index < 0) {    /* Just push if stack is empty */
13044                     av_push(stack, current);
13045                 }
13046                 else {
13047                     SV* top = av_pop(stack);
13048                     SV *prev = NULL;
13049                     char current_operator;
13050
13051                     if (IS_OPERAND(top)) {
13052                         SvREFCNT_dec_NN(top);
13053                         SvREFCNT_dec_NN(current);
13054                         vFAIL("Operand with no preceding operator");
13055                     }
13056                     current_operator = (char) SvUV(top);
13057                     switch (current_operator) {
13058                         case '(':   /* Push the '(' back on followed by the new
13059                                        operand */
13060                             av_push(stack, top);
13061                             av_push(stack, current);
13062                             SvREFCNT_inc(top);  /* Counters the '_dec' done
13063                                                    just after the 'break', so
13064                                                    it doesn't get wrongly freed
13065                                                  */
13066                             break;
13067
13068                         case '!':
13069                             _invlist_invert(current);
13070
13071                             /* Unlike binary operators, the top of the stack,
13072                              * now that this unary one has been popped off, may
13073                              * legally be an operator, and we now have operand
13074                              * for it. */
13075                             top_index--;
13076                             SvREFCNT_dec_NN(top);
13077                             goto handle_operand;
13078
13079                         case '&':
13080                             prev = av_pop(stack);
13081                             _invlist_intersection(prev,
13082                                                    current,
13083                                                    &current);
13084                             av_push(stack, current);
13085                             break;
13086
13087                         case '|':
13088                         case '+':
13089                             prev = av_pop(stack);
13090                             _invlist_union(prev, current, &current);
13091                             av_push(stack, current);
13092                             break;
13093
13094                         case '-':
13095                             prev = av_pop(stack);;
13096                             _invlist_subtract(prev, current, &current);
13097                             av_push(stack, current);
13098                             break;
13099
13100                         case '^':   /* The union minus the intersection */
13101                         {
13102                             SV* i = NULL;
13103                             SV* u = NULL;
13104                             SV* element;
13105
13106                             prev = av_pop(stack);
13107                             _invlist_union(prev, current, &u);
13108                             _invlist_intersection(prev, current, &i);
13109                             /* _invlist_subtract will overwrite current
13110                                 without freeing what it already contains */
13111                             element = current;
13112                             _invlist_subtract(u, i, &current);
13113                             av_push(stack, current);
13114                             SvREFCNT_dec_NN(i);
13115                             SvREFCNT_dec_NN(u);
13116                             SvREFCNT_dec_NN(element);
13117                             break;
13118                         }
13119
13120                         default:
13121                             Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
13122                 }
13123                 SvREFCNT_dec_NN(top);
13124                 SvREFCNT_dec(prev);
13125             }
13126         }
13127
13128         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13129     }
13130
13131     if (av_tindex(stack) < 0   /* Was empty */
13132         || ((final = av_pop(stack)) == NULL)
13133         || ! IS_OPERAND(final)
13134         || av_tindex(stack) >= 0)  /* More left on stack */
13135     {
13136         vFAIL("Incomplete expression within '(?[ ])'");
13137     }
13138
13139     /* Here, 'final' is the resultant inversion list from evaluating the
13140      * expression.  Return it if so requested */
13141     if (return_invlist) {
13142         *return_invlist = final;
13143         return END;
13144     }
13145
13146     /* Otherwise generate a resultant node, based on 'final'.  regclass() is
13147      * expecting a string of ranges and individual code points */
13148     invlist_iterinit(final);
13149     result_string = newSVpvs("");
13150     while (invlist_iternext(final, &start, &end)) {
13151         if (start == end) {
13152             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
13153         }
13154         else {
13155             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
13156                                                      start,          end);
13157         }
13158     }
13159
13160     save_parse = RExC_parse;
13161     RExC_parse = SvPV(result_string, len);
13162     save_end = RExC_end;
13163     RExC_end = RExC_parse + len;
13164
13165     /* We turn off folding around the call, as the class we have constructed
13166      * already has all folding taken into consideration, and we don't want
13167      * regclass() to add to that */
13168     RExC_flags &= ~RXf_PMf_FOLD;
13169     /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
13170      */
13171     node = regclass(pRExC_state, flagp,depth+1,
13172                     FALSE, /* means parse the whole char class */
13173                     FALSE, /* don't allow multi-char folds */
13174                     TRUE, /* silence non-portable warnings.  The above may very
13175                              well have generated non-portable code points, but
13176                              they're valid on this machine */
13177                     NULL);
13178     if (!node)
13179         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
13180                     PTR2UV(flagp));
13181     if (save_fold) {
13182         RExC_flags |= RXf_PMf_FOLD;
13183     }
13184     RExC_parse = save_parse + 1;
13185     RExC_end = save_end;
13186     SvREFCNT_dec_NN(final);
13187     SvREFCNT_dec_NN(result_string);
13188
13189     nextchar(pRExC_state);
13190     Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
13191     return node;
13192 }
13193 #undef IS_OPERAND
13194
13195 STATIC void
13196 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
13197 {
13198     /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
13199      * innocent-looking character class, like /[ks]/i won't have to go out to
13200      * disk to find the possible matches.
13201      *
13202      * This should be called only for a Latin1-range code points, cp, which is
13203      * known to be involved in a fold with other code points above Latin1.  It
13204      * would give false results if /aa has been specified.  Multi-char folds
13205      * are outside the scope of this, and must be handled specially.
13206      *
13207      * XXX It would be better to generate these via regen, in case a new
13208      * version of the Unicode standard adds new mappings, though that is not
13209      * really likely, and may be caught by the default: case of the switch
13210      * below. */
13211
13212     PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
13213
13214     switch (cp) {
13215         case 'k':
13216         case 'K':
13217           *invlist =
13218              add_cp_to_invlist(*invlist, KELVIN_SIGN);
13219             break;
13220         case 's':
13221         case 'S':
13222           *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
13223             break;
13224         case MICRO_SIGN:
13225           *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
13226           *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
13227             break;
13228         case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13229         case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13230           *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
13231             break;
13232         case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13233           *invlist = add_cp_to_invlist(*invlist,
13234                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13235             break;
13236         case LATIN_SMALL_LETTER_SHARP_S:
13237           *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
13238             break;
13239         case 'F': case 'f':
13240         case 'I': case 'i':
13241         case 'L': case 'l':
13242         case 'T': case 't':
13243         case 'A': case 'a':
13244         case 'H': case 'h':
13245         case 'J': case 'j':
13246         case 'N': case 'n':
13247         case 'W': case 'w':
13248         case 'Y': case 'y':
13249             /* These all are targets of multi-character folds from code points
13250              * that require UTF8 to express, so they can't match unless the
13251              * target string is in UTF-8, so no action here is necessary, as
13252              * regexec.c properly handles the general case for UTF-8 matching
13253              * and multi-char folds */
13254             break;
13255         default:
13256             /* Use deprecated warning to increase the chances of this being
13257              * output */
13258             ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
13259             break;
13260     }
13261 }
13262
13263 /* The names of properties whose definitions are not known at compile time are
13264  * stored in this SV, after a constant heading.  So if the length has been
13265  * changed since initialization, then there is a run-time definition. */
13266 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
13267                                         (SvCUR(listsv) != initial_listsv_len)
13268
13269 STATIC regnode *
13270 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
13271                  const bool stop_at_1,  /* Just parse the next thing, don't
13272                                            look for a full character class */
13273                  bool allow_multi_folds,
13274                  const bool silence_non_portable,   /* Don't output warnings
13275                                                        about too large
13276                                                        characters */
13277                  SV** ret_invlist)  /* Return an inversion list, not a node */
13278 {
13279     /* parse a bracketed class specification.  Most of these will produce an
13280      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
13281      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
13282      * under /i with multi-character folds: it will be rewritten following the
13283      * paradigm of this example, where the <multi-fold>s are characters which
13284      * fold to multiple character sequences:
13285      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
13286      * gets effectively rewritten as:
13287      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
13288      * reg() gets called (recursively) on the rewritten version, and this
13289      * function will return what it constructs.  (Actually the <multi-fold>s
13290      * aren't physically removed from the [abcdefghi], it's just that they are
13291      * ignored in the recursion by means of a flag:
13292      * <RExC_in_multi_char_class>.)
13293      *
13294      * ANYOF nodes contain a bit map for the first 256 characters, with the
13295      * corresponding bit set if that character is in the list.  For characters
13296      * above 255, a range list or swash is used.  There are extra bits for \w,
13297      * etc. in locale ANYOFs, as what these match is not determinable at
13298      * compile time
13299      *
13300      * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
13301      * to be restarted.  This can only happen if ret_invlist is non-NULL.
13302      */
13303
13304     dVAR;
13305     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
13306     IV range = 0;
13307     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
13308     regnode *ret;
13309     STRLEN numlen;
13310     IV namedclass = OOB_NAMEDCLASS;
13311     char *rangebegin = NULL;
13312     bool need_class = 0;
13313     SV *listsv = NULL;
13314     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
13315                                       than just initialized.  */
13316     SV* properties = NULL;    /* Code points that match \p{} \P{} */
13317     SV* posixes = NULL;     /* Code points that match classes like [:word:],
13318                                extended beyond the Latin1 range.  These have to
13319                                be kept separate from other code points for much
13320                                of this function because their handling  is
13321                                different under /i, and for most classes under
13322                                /d as well */
13323     SV* nposixes = NULL;    /* Similarly for [:^word:].  These are kept
13324                                separate for a while from the non-complemented
13325                                versions because of complications with /d
13326                                matching */
13327     UV element_count = 0;   /* Number of distinct elements in the class.
13328                                Optimizations may be possible if this is tiny */
13329     AV * multi_char_matches = NULL; /* Code points that fold to more than one
13330                                        character; used under /i */
13331     UV n;
13332     char * stop_ptr = RExC_end;    /* where to stop parsing */
13333     const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
13334                                                    space? */
13335     const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
13336
13337     /* Unicode properties are stored in a swash; this holds the current one
13338      * being parsed.  If this swash is the only above-latin1 component of the
13339      * character class, an optimization is to pass it directly on to the
13340      * execution engine.  Otherwise, it is set to NULL to indicate that there
13341      * are other things in the class that have to be dealt with at execution
13342      * time */
13343     SV* swash = NULL;           /* Code points that match \p{} \P{} */
13344
13345     /* Set if a component of this character class is user-defined; just passed
13346      * on to the engine */
13347     bool has_user_defined_property = FALSE;
13348
13349     /* inversion list of code points this node matches only when the target
13350      * string is in UTF-8.  (Because is under /d) */
13351     SV* depends_list = NULL;
13352
13353     /* Inversion list of code points this node matches regardless of things
13354      * like locale, folding, utf8ness of the target string */
13355     SV* cp_list = NULL;
13356
13357     /* Like cp_list, but code points on this list need to be checked for things
13358      * that fold to/from them under /i */
13359     SV* cp_foldable_list = NULL;
13360
13361     /* Like cp_list, but code points on this list are valid only when the
13362      * runtime locale is UTF-8 */
13363     SV* only_utf8_locale_list = NULL;
13364
13365 #ifdef EBCDIC
13366     /* In a range, counts how many 0-2 of the ends of it came from literals,
13367      * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
13368     UV literal_endpoint = 0;
13369 #endif
13370     bool invert = FALSE;    /* Is this class to be complemented */
13371
13372     bool warn_super = ALWAYS_WARN_SUPER;
13373
13374     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
13375         case we need to change the emitted regop to an EXACT. */
13376     const char * orig_parse = RExC_parse;
13377     const SSize_t orig_size = RExC_size;
13378     bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
13379     GET_RE_DEBUG_FLAGS_DECL;
13380
13381     PERL_ARGS_ASSERT_REGCLASS;
13382 #ifndef DEBUGGING
13383     PERL_UNUSED_ARG(depth);
13384 #endif
13385
13386     DEBUG_PARSE("clas");
13387
13388     /* Assume we are going to generate an ANYOF node. */
13389     ret = reganode(pRExC_state, ANYOF, 0);
13390
13391     if (SIZE_ONLY) {
13392         RExC_size += ANYOF_SKIP;
13393         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
13394     }
13395     else {
13396         ANYOF_FLAGS(ret) = 0;
13397
13398         RExC_emit += ANYOF_SKIP;
13399         listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
13400         initial_listsv_len = SvCUR(listsv);
13401         SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
13402     }
13403
13404     if (skip_white) {
13405         RExC_parse = regpatws(pRExC_state, RExC_parse,
13406                               FALSE /* means don't recognize comments */ );
13407     }
13408
13409     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
13410         RExC_parse++;
13411         invert = TRUE;
13412         allow_multi_folds = FALSE;
13413         RExC_naughty++;
13414         if (skip_white) {
13415             RExC_parse = regpatws(pRExC_state, RExC_parse,
13416                                   FALSE /* means don't recognize comments */ );
13417         }
13418     }
13419
13420     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
13421     if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
13422         const char *s = RExC_parse;
13423         const char  c = *s++;
13424
13425         while (isWORDCHAR(*s))
13426             s++;
13427         if (*s && c == *s && s[1] == ']') {
13428             SAVEFREESV(RExC_rx_sv);
13429             ckWARN3reg(s+2,
13430                        "POSIX syntax [%c %c] belongs inside character classes",
13431                        c, c);
13432             (void)ReREFCNT_inc(RExC_rx_sv);
13433         }
13434     }
13435
13436     /* If the caller wants us to just parse a single element, accomplish this
13437      * by faking the loop ending condition */
13438     if (stop_at_1 && RExC_end > RExC_parse) {
13439         stop_ptr = RExC_parse + 1;
13440     }
13441
13442     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
13443     if (UCHARAT(RExC_parse) == ']')
13444         goto charclassloop;
13445
13446 parseit:
13447     while (1) {
13448         if  (RExC_parse >= stop_ptr) {
13449             break;
13450         }
13451
13452         if (skip_white) {
13453             RExC_parse = regpatws(pRExC_state, RExC_parse,
13454                                   FALSE /* means don't recognize comments */ );
13455         }
13456
13457         if  (UCHARAT(RExC_parse) == ']') {
13458             break;
13459         }
13460
13461     charclassloop:
13462
13463         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
13464         save_value = value;
13465         save_prevvalue = prevvalue;
13466
13467         if (!range) {
13468             rangebegin = RExC_parse;
13469             element_count++;
13470         }
13471         if (UTF) {
13472             value = utf8n_to_uvchr((U8*)RExC_parse,
13473                                    RExC_end - RExC_parse,
13474                                    &numlen, UTF8_ALLOW_DEFAULT);
13475             RExC_parse += numlen;
13476         }
13477         else
13478             value = UCHARAT(RExC_parse++);
13479
13480         if (value == '['
13481             && RExC_parse < RExC_end
13482             && POSIXCC(UCHARAT(RExC_parse)))
13483         {
13484             namedclass = regpposixcc(pRExC_state, value, strict);
13485         }
13486         else if (value == '\\') {
13487             if (UTF) {
13488                 value = utf8n_to_uvchr((U8*)RExC_parse,
13489                                    RExC_end - RExC_parse,
13490                                    &numlen, UTF8_ALLOW_DEFAULT);
13491                 RExC_parse += numlen;
13492             }
13493             else
13494                 value = UCHARAT(RExC_parse++);
13495
13496             /* Some compilers cannot handle switching on 64-bit integer
13497              * values, therefore value cannot be an UV.  Yes, this will
13498              * be a problem later if we want switch on Unicode.
13499              * A similar issue a little bit later when switching on
13500              * namedclass. --jhi */
13501
13502             /* If the \ is escaping white space when white space is being
13503              * skipped, it means that that white space is wanted literally, and
13504              * is already in 'value'.  Otherwise, need to translate the escape
13505              * into what it signifies. */
13506             if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
13507
13508             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
13509             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
13510             case 's':   namedclass = ANYOF_SPACE;       break;
13511             case 'S':   namedclass = ANYOF_NSPACE;      break;
13512             case 'd':   namedclass = ANYOF_DIGIT;       break;
13513             case 'D':   namedclass = ANYOF_NDIGIT;      break;
13514             case 'v':   namedclass = ANYOF_VERTWS;      break;
13515             case 'V':   namedclass = ANYOF_NVERTWS;     break;
13516             case 'h':   namedclass = ANYOF_HORIZWS;     break;
13517             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
13518             case 'N':  /* Handle \N{NAME} in class */
13519                 {
13520                     /* We only pay attention to the first char of
13521                     multichar strings being returned. I kinda wonder
13522                     if this makes sense as it does change the behaviour
13523                     from earlier versions, OTOH that behaviour was broken
13524                     as well. */
13525                     if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
13526                                       TRUE, /* => charclass */
13527                                       strict))
13528                     {
13529                         if (*flagp & RESTART_UTF8)
13530                             FAIL("panic: grok_bslash_N set RESTART_UTF8");
13531                         goto parseit;
13532                     }
13533                 }
13534                 break;
13535             case 'p':
13536             case 'P':
13537                 {
13538                 char *e;
13539
13540                 /* We will handle any undefined properties ourselves */
13541                 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
13542                                        /* And we actually would prefer to get
13543                                         * the straight inversion list of the
13544                                         * swash, since we will be accessing it
13545                                         * anyway, to save a little time */
13546                                       |_CORE_SWASH_INIT_ACCEPT_INVLIST;
13547
13548                 if (RExC_parse >= RExC_end)
13549                     vFAIL2("Empty \\%c{}", (U8)value);
13550                 if (*RExC_parse == '{') {
13551                     const U8 c = (U8)value;
13552                     e = strchr(RExC_parse++, '}');
13553                     if (!e)
13554                         vFAIL2("Missing right brace on \\%c{}", c);
13555                     while (isSPACE(*RExC_parse))
13556                         RExC_parse++;
13557                     if (e == RExC_parse)
13558                         vFAIL2("Empty \\%c{}", c);
13559                     n = e - RExC_parse;
13560                     while (isSPACE(*(RExC_parse + n - 1)))
13561                         n--;
13562                 }
13563                 else {
13564                     e = RExC_parse;
13565                     n = 1;
13566                 }
13567                 if (!SIZE_ONLY) {
13568                     SV* invlist;
13569                     char* name;
13570
13571                     if (UCHARAT(RExC_parse) == '^') {
13572                          RExC_parse++;
13573                          n--;
13574                          /* toggle.  (The rhs xor gets the single bit that
13575                           * differs between P and p; the other xor inverts just
13576                           * that bit) */
13577                          value ^= 'P' ^ 'p';
13578
13579                          while (isSPACE(*RExC_parse)) {
13580                               RExC_parse++;
13581                               n--;
13582                          }
13583                     }
13584                     /* Try to get the definition of the property into
13585                      * <invlist>.  If /i is in effect, the effective property
13586                      * will have its name be <__NAME_i>.  The design is
13587                      * discussed in commit
13588                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
13589                     name = savepv(Perl_form(aTHX_
13590                                           "%s%.*s%s\n",
13591                                           (FOLD) ? "__" : "",
13592                                           (int)n,
13593                                           RExC_parse,
13594                                           (FOLD) ? "_i" : ""
13595                                 ));
13596
13597                     /* Look up the property name, and get its swash and
13598                      * inversion list, if the property is found  */
13599                     if (swash) {
13600                         SvREFCNT_dec_NN(swash);
13601                     }
13602                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
13603                                              1, /* binary */
13604                                              0, /* not tr/// */
13605                                              NULL, /* No inversion list */
13606                                              &swash_init_flags
13607                                             );
13608                     if (! swash || ! (invlist = _get_swash_invlist(swash))) {
13609                         if (swash) {
13610                             SvREFCNT_dec_NN(swash);
13611                             swash = NULL;
13612                         }
13613
13614                         /* Here didn't find it.  It could be a user-defined
13615                          * property that will be available at run-time.  If we
13616                          * accept only compile-time properties, is an error;
13617                          * otherwise add it to the list for run-time look up */
13618                         if (ret_invlist) {
13619                             RExC_parse = e + 1;
13620                             vFAIL2utf8f(
13621                                 "Property '%"UTF8f"' is unknown",
13622                                 UTF8fARG(UTF, n, name));
13623                         }
13624
13625                         /* If the property name doesn't already have a package
13626                          * name, add the current one to it so that it can be
13627                          * referred to outside it. [perl #121777] */
13628                         if (! instr(name, "::") && PL_curstash) {
13629                             char* full_name = Perl_form(aTHX_
13630                                                         "%s::%s",
13631                                                         HvNAME(PL_curstash),
13632                                                         name);
13633                             n = strlen(full_name);
13634                             Safefree(name);
13635                             name = savepvn(full_name, n);
13636                         }
13637                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
13638                                         (value == 'p' ? '+' : '!'),
13639                                         UTF8fARG(UTF, n, name));
13640                         has_user_defined_property = TRUE;
13641
13642                         /* We don't know yet, so have to assume that the
13643                          * property could match something in the Latin1 range,
13644                          * hence something that isn't utf8.  Note that this
13645                          * would cause things in <depends_list> to match
13646                          * inappropriately, except that any \p{}, including
13647                          * this one forces Unicode semantics, which means there
13648                          * is no <depends_list> */
13649                         ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
13650                     }
13651                     else {
13652
13653                         /* Here, did get the swash and its inversion list.  If
13654                          * the swash is from a user-defined property, then this
13655                          * whole character class should be regarded as such */
13656                         if (swash_init_flags
13657                             & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
13658                         {
13659                             has_user_defined_property = TRUE;
13660                         }
13661                         else if
13662                             /* We warn on matching an above-Unicode code point
13663                              * if the match would return true, except don't
13664                              * warn for \p{All}, which has exactly one element
13665                              * = 0 */
13666                             (_invlist_contains_cp(invlist, 0x110000)
13667                                 && (! (_invlist_len(invlist) == 1
13668                                        && *invlist_array(invlist) == 0)))
13669                         {
13670                             warn_super = TRUE;
13671                         }
13672
13673
13674                         /* Invert if asking for the complement */
13675                         if (value == 'P') {
13676                             _invlist_union_complement_2nd(properties,
13677                                                           invlist,
13678                                                           &properties);
13679
13680                             /* The swash can't be used as-is, because we've
13681                              * inverted things; delay removing it to here after
13682                              * have copied its invlist above */
13683                             SvREFCNT_dec_NN(swash);
13684                             swash = NULL;
13685                         }
13686                         else {
13687                             _invlist_union(properties, invlist, &properties);
13688                         }
13689                     }
13690                     Safefree(name);
13691                 }
13692                 RExC_parse = e + 1;
13693                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
13694                                                 named */
13695
13696                 /* \p means they want Unicode semantics */
13697                 RExC_uni_semantics = 1;
13698                 }
13699                 break;
13700             case 'n':   value = '\n';                   break;
13701             case 'r':   value = '\r';                   break;
13702             case 't':   value = '\t';                   break;
13703             case 'f':   value = '\f';                   break;
13704             case 'b':   value = '\b';                   break;
13705             case 'e':   value = ASCII_TO_NATIVE('\033');break;
13706             case 'a':   value = '\a';                   break;
13707             case 'o':
13708                 RExC_parse--;   /* function expects to be pointed at the 'o' */
13709                 {
13710                     const char* error_msg;
13711                     bool valid = grok_bslash_o(&RExC_parse,
13712                                                &value,
13713                                                &error_msg,
13714                                                SIZE_ONLY,   /* warnings in pass
13715                                                                1 only */
13716                                                strict,
13717                                                silence_non_portable,
13718                                                UTF);
13719                     if (! valid) {
13720                         vFAIL(error_msg);
13721                     }
13722                 }
13723                 if (PL_encoding && value < 0x100) {
13724                     goto recode_encoding;
13725                 }
13726                 break;
13727             case 'x':
13728                 RExC_parse--;   /* function expects to be pointed at the 'x' */
13729                 {
13730                     const char* error_msg;
13731                     bool valid = grok_bslash_x(&RExC_parse,
13732                                                &value,
13733                                                &error_msg,
13734                                                TRUE, /* Output warnings */
13735                                                strict,
13736                                                silence_non_portable,
13737                                                UTF);
13738                     if (! valid) {
13739                         vFAIL(error_msg);
13740                     }
13741                 }
13742                 if (PL_encoding && value < 0x100)
13743                     goto recode_encoding;
13744                 break;
13745             case 'c':
13746                 value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
13747                 break;
13748             case '0': case '1': case '2': case '3': case '4':
13749             case '5': case '6': case '7':
13750                 {
13751                     /* Take 1-3 octal digits */
13752                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
13753                     numlen = (strict) ? 4 : 3;
13754                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
13755                     RExC_parse += numlen;
13756                     if (numlen != 3) {
13757                         if (strict) {
13758                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13759                             vFAIL("Need exactly 3 octal digits");
13760                         }
13761                         else if (! SIZE_ONLY /* like \08, \178 */
13762                                  && numlen < 3
13763                                  && RExC_parse < RExC_end
13764                                  && isDIGIT(*RExC_parse)
13765                                  && ckWARN(WARN_REGEXP))
13766                         {
13767                             SAVEFREESV(RExC_rx_sv);
13768                             reg_warn_non_literal_string(
13769                                  RExC_parse + 1,
13770                                  form_short_octal_warning(RExC_parse, numlen));
13771                             (void)ReREFCNT_inc(RExC_rx_sv);
13772                         }
13773                     }
13774                     if (PL_encoding && value < 0x100)
13775                         goto recode_encoding;
13776                     break;
13777                 }
13778             recode_encoding:
13779                 if (! RExC_override_recoding) {
13780                     SV* enc = PL_encoding;
13781                     value = reg_recode((const char)(U8)value, &enc);
13782                     if (!enc) {
13783                         if (strict) {
13784                             vFAIL("Invalid escape in the specified encoding");
13785                         }
13786                         else if (SIZE_ONLY) {
13787                             ckWARNreg(RExC_parse,
13788                                   "Invalid escape in the specified encoding");
13789                         }
13790                     }
13791                     break;
13792                 }
13793             default:
13794                 /* Allow \_ to not give an error */
13795                 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
13796                     if (strict) {
13797                         vFAIL2("Unrecognized escape \\%c in character class",
13798                                (int)value);
13799                     }
13800                     else {
13801                         SAVEFREESV(RExC_rx_sv);
13802                         ckWARN2reg(RExC_parse,
13803                             "Unrecognized escape \\%c in character class passed through",
13804                             (int)value);
13805                         (void)ReREFCNT_inc(RExC_rx_sv);
13806                     }
13807                 }
13808                 break;
13809             }   /* End of switch on char following backslash */
13810         } /* end of handling backslash escape sequences */
13811 #ifdef EBCDIC
13812         else
13813             literal_endpoint++;
13814 #endif
13815
13816         /* Here, we have the current token in 'value' */
13817
13818         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
13819             U8 classnum;
13820
13821             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
13822              * literal, as is the character that began the false range, i.e.
13823              * the 'a' in the examples */
13824             if (range) {
13825                 if (!SIZE_ONLY) {
13826                     const int w = (RExC_parse >= rangebegin)
13827                                   ? RExC_parse - rangebegin
13828                                   : 0;
13829                     if (strict) {
13830                         vFAIL2utf8f(
13831                             "False [] range \"%"UTF8f"\"",
13832                             UTF8fARG(UTF, w, rangebegin));
13833                     }
13834                     else {
13835                         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
13836                         ckWARN2reg(RExC_parse,
13837                             "False [] range \"%"UTF8f"\"",
13838                             UTF8fARG(UTF, w, rangebegin));
13839                         (void)ReREFCNT_inc(RExC_rx_sv);
13840                         cp_list = add_cp_to_invlist(cp_list, '-');
13841                         cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
13842                                                              prevvalue);
13843                     }
13844                 }
13845
13846                 range = 0; /* this was not a true range */
13847                 element_count += 2; /* So counts for three values */
13848             }
13849
13850             classnum = namedclass_to_classnum(namedclass);
13851
13852             if (LOC && namedclass < ANYOF_POSIXL_MAX
13853 #ifndef HAS_ISASCII
13854                 && classnum != _CC_ASCII
13855 #endif
13856             ) {
13857                 /* What the Posix classes (like \w, [:space:]) match in locale
13858                  * isn't knowable under locale until actual match time.  Room
13859                  * must be reserved (one time per outer bracketed class) to
13860                  * store such classes.  The space will contain a bit for each
13861                  * named class that is to be matched against.  This isn't
13862                  * needed for \p{} and pseudo-classes, as they are not affected
13863                  * by locale, and hence are dealt with separately */
13864                 if (! need_class) {
13865                     need_class = 1;
13866                     if (SIZE_ONLY) {
13867                         RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13868                     }
13869                     else {
13870                         RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13871                     }
13872                     ANYOF_FLAGS(ret) |= ANYOF_POSIXL;
13873                     ANYOF_POSIXL_ZERO(ret);
13874                 }
13875
13876                 /* Coverity thinks it is possible for this to be negative; both
13877                  * jhi and khw think it's not, but be safer */
13878                 assert(! (ANYOF_FLAGS(ret) & ANYOF_POSIXL)
13879                        || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
13880
13881                 /* See if it already matches the complement of this POSIX
13882                  * class */
13883                 if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL)
13884                     && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
13885                                                             ? -1
13886                                                             : 1)))
13887                 {
13888                     posixl_matches_all = TRUE;
13889                     break;  /* No need to continue.  Since it matches both
13890                                e.g., \w and \W, it matches everything, and the
13891                                bracketed class can be optimized into qr/./s */
13892                 }
13893
13894                 /* Add this class to those that should be checked at runtime */
13895                 ANYOF_POSIXL_SET(ret, namedclass);
13896
13897                 /* The above-Latin1 characters are not subject to locale rules.
13898                  * Just add them, in the second pass, to the
13899                  * unconditionally-matched list */
13900                 if (! SIZE_ONLY) {
13901                     SV* scratch_list = NULL;
13902
13903                     /* Get the list of the above-Latin1 code points this
13904                      * matches */
13905                     _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
13906                                           PL_XPosix_ptrs[classnum],
13907
13908                                           /* Odd numbers are complements, like
13909                                            * NDIGIT, NASCII, ... */
13910                                           namedclass % 2 != 0,
13911                                           &scratch_list);
13912                     /* Checking if 'cp_list' is NULL first saves an extra
13913                      * clone.  Its reference count will be decremented at the
13914                      * next union, etc, or if this is the only instance, at the
13915                      * end of the routine */
13916                     if (! cp_list) {
13917                         cp_list = scratch_list;
13918                     }
13919                     else {
13920                         _invlist_union(cp_list, scratch_list, &cp_list);
13921                         SvREFCNT_dec_NN(scratch_list);
13922                     }
13923                     continue;   /* Go get next character */
13924                 }
13925             }
13926             else if (! SIZE_ONLY) {
13927
13928                 /* Here, not in pass1 (in that pass we skip calculating the
13929                  * contents of this class), and is /l, or is a POSIX class for
13930                  * which /l doesn't matter (or is a Unicode property, which is
13931                  * skipped here). */
13932                 if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
13933                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
13934
13935                         /* Here, should be \h, \H, \v, or \V.  None of /d, /i
13936                          * nor /l make a difference in what these match,
13937                          * therefore we just add what they match to cp_list. */
13938                         if (classnum != _CC_VERTSPACE) {
13939                             assert(   namedclass == ANYOF_HORIZWS
13940                                    || namedclass == ANYOF_NHORIZWS);
13941
13942                             /* It turns out that \h is just a synonym for
13943                              * XPosixBlank */
13944                             classnum = _CC_BLANK;
13945                         }
13946
13947                         _invlist_union_maybe_complement_2nd(
13948                                 cp_list,
13949                                 PL_XPosix_ptrs[classnum],
13950                                 namedclass % 2 != 0,    /* Complement if odd
13951                                                           (NHORIZWS, NVERTWS)
13952                                                         */
13953                                 &cp_list);
13954                     }
13955                 }
13956                 else {  /* Garden variety class.  If is NASCII, NDIGIT, ...
13957                            complement and use nposixes */
13958                     SV** posixes_ptr = namedclass % 2 == 0
13959                                        ? &posixes
13960                                        : &nposixes;
13961                     SV** source_ptr = &PL_XPosix_ptrs[classnum];
13962                     _invlist_union_maybe_complement_2nd(
13963                                                      *posixes_ptr,
13964                                                      *source_ptr,
13965                                                      namedclass % 2 != 0,
13966                                                      posixes_ptr);
13967                 }
13968                 continue;   /* Go get next character */
13969             }
13970         } /* end of namedclass \blah */
13971
13972         /* Here, we have a single value.  If 'range' is set, it is the ending
13973          * of a range--check its validity.  Later, we will handle each
13974          * individual code point in the range.  If 'range' isn't set, this
13975          * could be the beginning of a range, so check for that by looking
13976          * ahead to see if the next real character to be processed is the range
13977          * indicator--the minus sign */
13978
13979         if (skip_white) {
13980             RExC_parse = regpatws(pRExC_state, RExC_parse,
13981                                 FALSE /* means don't recognize comments */ );
13982         }
13983
13984         if (range) {
13985             if (prevvalue > value) /* b-a */ {
13986                 const int w = RExC_parse - rangebegin;
13987                 vFAIL2utf8f(
13988                     "Invalid [] range \"%"UTF8f"\"",
13989                     UTF8fARG(UTF, w, rangebegin));
13990                 range = 0; /* not a valid range */
13991             }
13992         }
13993         else {
13994             prevvalue = value; /* save the beginning of the potential range */
13995             if (! stop_at_1     /* Can't be a range if parsing just one thing */
13996                 && *RExC_parse == '-')
13997             {
13998                 char* next_char_ptr = RExC_parse + 1;
13999                 if (skip_white) {   /* Get the next real char after the '-' */
14000                     next_char_ptr = regpatws(pRExC_state,
14001                                              RExC_parse + 1,
14002                                              FALSE); /* means don't recognize
14003                                                         comments */
14004                 }
14005
14006                 /* If the '-' is at the end of the class (just before the ']',
14007                  * it is a literal minus; otherwise it is a range */
14008                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
14009                     RExC_parse = next_char_ptr;
14010
14011                     /* a bad range like \w-, [:word:]- ? */
14012                     if (namedclass > OOB_NAMEDCLASS) {
14013                         if (strict || ckWARN(WARN_REGEXP)) {
14014                             const int w =
14015                                 RExC_parse >= rangebegin ?
14016                                 RExC_parse - rangebegin : 0;
14017                             if (strict) {
14018                                 vFAIL4("False [] range \"%*.*s\"",
14019                                     w, w, rangebegin);
14020                             }
14021                             else {
14022                                 vWARN4(RExC_parse,
14023                                     "False [] range \"%*.*s\"",
14024                                     w, w, rangebegin);
14025                             }
14026                         }
14027                         if (!SIZE_ONLY) {
14028                             cp_list = add_cp_to_invlist(cp_list, '-');
14029                         }
14030                         element_count++;
14031                     } else
14032                         range = 1;      /* yeah, it's a range! */
14033                     continue;   /* but do it the next time */
14034                 }
14035             }
14036         }
14037
14038         /* Here, <prevvalue> is the beginning of the range, if any; or <value>
14039          * if not */
14040
14041         /* non-Latin1 code point implies unicode semantics.  Must be set in
14042          * pass1 so is there for the whole of pass 2 */
14043         if (value > 255) {
14044             RExC_uni_semantics = 1;
14045         }
14046
14047         /* Ready to process either the single value, or the completed range.
14048          * For single-valued non-inverted ranges, we consider the possibility
14049          * of multi-char folds.  (We made a conscious decision to not do this
14050          * for the other cases because it can often lead to non-intuitive
14051          * results.  For example, you have the peculiar case that:
14052          *  "s s" =~ /^[^\xDF]+$/i => Y
14053          *  "ss"  =~ /^[^\xDF]+$/i => N
14054          *
14055          * See [perl #89750] */
14056         if (FOLD && allow_multi_folds && value == prevvalue) {
14057             if (value == LATIN_SMALL_LETTER_SHARP_S
14058                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
14059                                                         value)))
14060             {
14061                 /* Here <value> is indeed a multi-char fold.  Get what it is */
14062
14063                 U8 foldbuf[UTF8_MAXBYTES_CASE];
14064                 STRLEN foldlen;
14065
14066                 UV folded = _to_uni_fold_flags(
14067                                 value,
14068                                 foldbuf,
14069                                 &foldlen,
14070                                 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
14071                                                    ? FOLD_FLAGS_NOMIX_ASCII
14072                                                    : 0)
14073                                 );
14074
14075                 /* Here, <folded> should be the first character of the
14076                  * multi-char fold of <value>, with <foldbuf> containing the
14077                  * whole thing.  But, if this fold is not allowed (because of
14078                  * the flags), <fold> will be the same as <value>, and should
14079                  * be processed like any other character, so skip the special
14080                  * handling */
14081                 if (folded != value) {
14082
14083                     /* Skip if we are recursed, currently parsing the class
14084                      * again.  Otherwise add this character to the list of
14085                      * multi-char folds. */
14086                     if (! RExC_in_multi_char_class) {
14087                         AV** this_array_ptr;
14088                         AV* this_array;
14089                         STRLEN cp_count = utf8_length(foldbuf,
14090                                                       foldbuf + foldlen);
14091                         SV* multi_fold = sv_2mortal(newSVpvn("", 0));
14092
14093                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
14094
14095
14096                         if (! multi_char_matches) {
14097                             multi_char_matches = newAV();
14098                         }
14099
14100                         /* <multi_char_matches> is actually an array of arrays.
14101                          * There will be one or two top-level elements: [2],
14102                          * and/or [3].  The [2] element is an array, each
14103                          * element thereof is a character which folds to TWO
14104                          * characters; [3] is for folds to THREE characters.
14105                          * (Unicode guarantees a maximum of 3 characters in any
14106                          * fold.)  When we rewrite the character class below,
14107                          * we will do so such that the longest folds are
14108                          * written first, so that it prefers the longest
14109                          * matching strings first.  This is done even if it
14110                          * turns out that any quantifier is non-greedy, out of
14111                          * programmer laziness.  Tom Christiansen has agreed
14112                          * that this is ok.  This makes the test for the
14113                          * ligature 'ffi' come before the test for 'ff' */
14114                         if (av_exists(multi_char_matches, cp_count)) {
14115                             this_array_ptr = (AV**) av_fetch(multi_char_matches,
14116                                                              cp_count, FALSE);
14117                             this_array = *this_array_ptr;
14118                         }
14119                         else {
14120                             this_array = newAV();
14121                             av_store(multi_char_matches, cp_count,
14122                                      (SV*) this_array);
14123                         }
14124                         av_push(this_array, multi_fold);
14125                     }
14126
14127                     /* This element should not be processed further in this
14128                      * class */
14129                     element_count--;
14130                     value = save_value;
14131                     prevvalue = save_prevvalue;
14132                     continue;
14133                 }
14134             }
14135         }
14136
14137         /* Deal with this element of the class */
14138         if (! SIZE_ONLY) {
14139 #ifndef EBCDIC
14140             cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14141                                                      prevvalue, value);
14142 #else
14143             SV* this_range = _new_invlist(1);
14144             _append_range_to_invlist(this_range, prevvalue, value);
14145
14146             /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
14147              * If this range was specified using something like 'i-j', we want
14148              * to include only the 'i' and the 'j', and not anything in
14149              * between, so exclude non-ASCII, non-alphabetics from it.
14150              * However, if the range was specified with something like
14151              * [\x89-\x91] or [\x89-j], all code points within it should be
14152              * included.  literal_endpoint==2 means both ends of the range used
14153              * a literal character, not \x{foo} */
14154             if (literal_endpoint == 2
14155                 && ((prevvalue >= 'a' && value <= 'z')
14156                     || (prevvalue >= 'A' && value <= 'Z')))
14157             {
14158                 _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ASCII],
14159                                       &this_range);
14160
14161                 /* Since this above only contains ascii, the intersection of it
14162                  * with anything will still yield only ascii */
14163                 _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA],
14164                                       &this_range);
14165             }
14166             _invlist_union(cp_foldable_list, this_range, &cp_foldable_list);
14167             literal_endpoint = 0;
14168 #endif
14169         }
14170
14171         range = 0; /* this range (if it was one) is done now */
14172     } /* End of loop through all the text within the brackets */
14173
14174     /* If anything in the class expands to more than one character, we have to
14175      * deal with them by building up a substitute parse string, and recursively
14176      * calling reg() on it, instead of proceeding */
14177     if (multi_char_matches) {
14178         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
14179         I32 cp_count;
14180         STRLEN len;
14181         char *save_end = RExC_end;
14182         char *save_parse = RExC_parse;
14183         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
14184                                        a "|" */
14185         I32 reg_flags;
14186
14187         assert(! invert);
14188 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
14189            because too confusing */
14190         if (invert) {
14191             sv_catpv(substitute_parse, "(?:");
14192         }
14193 #endif
14194
14195         /* Look at the longest folds first */
14196         for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
14197
14198             if (av_exists(multi_char_matches, cp_count)) {
14199                 AV** this_array_ptr;
14200                 SV* this_sequence;
14201
14202                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
14203                                                  cp_count, FALSE);
14204                 while ((this_sequence = av_pop(*this_array_ptr)) !=
14205                                                                 &PL_sv_undef)
14206                 {
14207                     if (! first_time) {
14208                         sv_catpv(substitute_parse, "|");
14209                     }
14210                     first_time = FALSE;
14211
14212                     sv_catpv(substitute_parse, SvPVX(this_sequence));
14213                 }
14214             }
14215         }
14216
14217         /* If the character class contains anything else besides these
14218          * multi-character folds, have to include it in recursive parsing */
14219         if (element_count) {
14220             sv_catpv(substitute_parse, "|[");
14221             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
14222             sv_catpv(substitute_parse, "]");
14223         }
14224
14225         sv_catpv(substitute_parse, ")");
14226 #if 0
14227         if (invert) {
14228             /* This is a way to get the parse to skip forward a whole named
14229              * sequence instead of matching the 2nd character when it fails the
14230              * first */
14231             sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
14232         }
14233 #endif
14234
14235         RExC_parse = SvPV(substitute_parse, len);
14236         RExC_end = RExC_parse + len;
14237         RExC_in_multi_char_class = 1;
14238         RExC_emit = (regnode *)orig_emit;
14239
14240         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
14241
14242         *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
14243
14244         RExC_parse = save_parse;
14245         RExC_end = save_end;
14246         RExC_in_multi_char_class = 0;
14247         SvREFCNT_dec_NN(multi_char_matches);
14248         return ret;
14249     }
14250
14251     /* Here, we've gone through the entire class and dealt with multi-char
14252      * folds.  We are now in a position that we can do some checks to see if we
14253      * can optimize this ANYOF node into a simpler one, even in Pass 1.
14254      * Currently we only do two checks:
14255      * 1) is in the unlikely event that the user has specified both, eg. \w and
14256      *    \W under /l, then the class matches everything.  (This optimization
14257      *    is done only to make the optimizer code run later work.)
14258      * 2) if the character class contains only a single element (including a
14259      *    single range), we see if there is an equivalent node for it.
14260      * Other checks are possible */
14261     if (! ret_invlist   /* Can't optimize if returning the constructed
14262                            inversion list */
14263         && (UNLIKELY(posixl_matches_all) || element_count == 1))
14264     {
14265         U8 op = END;
14266         U8 arg = 0;
14267
14268         if (UNLIKELY(posixl_matches_all)) {
14269             op = SANY;
14270         }
14271         else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
14272                                                    \w or [:digit:] or \p{foo}
14273                                                  */
14274
14275             /* All named classes are mapped into POSIXish nodes, with its FLAG
14276              * argument giving which class it is */
14277             switch ((I32)namedclass) {
14278                 case ANYOF_UNIPROP:
14279                     break;
14280
14281                 /* These don't depend on the charset modifiers.  They always
14282                  * match under /u rules */
14283                 case ANYOF_NHORIZWS:
14284                 case ANYOF_HORIZWS:
14285                     namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
14286                     /* FALLTHROUGH */
14287
14288                 case ANYOF_NVERTWS:
14289                 case ANYOF_VERTWS:
14290                     op = POSIXU;
14291                     goto join_posix;
14292
14293                 /* The actual POSIXish node for all the rest depends on the
14294                  * charset modifier.  The ones in the first set depend only on
14295                  * ASCII or, if available on this platform, locale */
14296                 case ANYOF_ASCII:
14297                 case ANYOF_NASCII:
14298 #ifdef HAS_ISASCII
14299                     op = (LOC) ? POSIXL : POSIXA;
14300 #else
14301                     op = POSIXA;
14302 #endif
14303                     goto join_posix;
14304
14305                 case ANYOF_NCASED:
14306                 case ANYOF_LOWER:
14307                 case ANYOF_NLOWER:
14308                 case ANYOF_UPPER:
14309                 case ANYOF_NUPPER:
14310                     /* under /a could be alpha */
14311                     if (FOLD) {
14312                         if (ASCII_RESTRICTED) {
14313                             namedclass = ANYOF_ALPHA + (namedclass % 2);
14314                         }
14315                         else if (! LOC) {
14316                             break;
14317                         }
14318                     }
14319                     /* FALLTHROUGH */
14320
14321                 /* The rest have more possibilities depending on the charset.
14322                  * We take advantage of the enum ordering of the charset
14323                  * modifiers to get the exact node type, */
14324                 default:
14325                     op = POSIXD + get_regex_charset(RExC_flags);
14326                     if (op > POSIXA) { /* /aa is same as /a */
14327                         op = POSIXA;
14328                     }
14329
14330                 join_posix:
14331                     /* The odd numbered ones are the complements of the
14332                      * next-lower even number one */
14333                     if (namedclass % 2 == 1) {
14334                         invert = ! invert;
14335                         namedclass--;
14336                     }
14337                     arg = namedclass_to_classnum(namedclass);
14338                     break;
14339             }
14340         }
14341         else if (value == prevvalue) {
14342
14343             /* Here, the class consists of just a single code point */
14344
14345             if (invert) {
14346                 if (! LOC && value == '\n') {
14347                     op = REG_ANY; /* Optimize [^\n] */
14348                     *flagp |= HASWIDTH|SIMPLE;
14349                     RExC_naughty++;
14350                 }
14351             }
14352             else if (value < 256 || UTF) {
14353
14354                 /* Optimize a single value into an EXACTish node, but not if it
14355                  * would require converting the pattern to UTF-8. */
14356                 op = compute_EXACTish(pRExC_state);
14357             }
14358         } /* Otherwise is a range */
14359         else if (! LOC) {   /* locale could vary these */
14360             if (prevvalue == '0') {
14361                 if (value == '9') {
14362                     arg = _CC_DIGIT;
14363                     op = POSIXA;
14364                 }
14365             }
14366             else if (prevvalue == 'A') {
14367                 if (value == 'Z'
14368 #ifdef EBCDIC
14369                     && literal_endpoint == 2
14370 #endif
14371                 ) {
14372                     arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
14373                     op = POSIXA;
14374                 }
14375             }
14376             else if (prevvalue == 'a') {
14377                 if (value == 'z'
14378 #ifdef EBCDIC
14379                     && literal_endpoint == 2
14380 #endif
14381                 ) {
14382                     arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
14383                     op = POSIXA;
14384                 }
14385             }
14386         }
14387
14388         /* Here, we have changed <op> away from its initial value iff we found
14389          * an optimization */
14390         if (op != END) {
14391
14392             /* Throw away this ANYOF regnode, and emit the calculated one,
14393              * which should correspond to the beginning, not current, state of
14394              * the parse */
14395             const char * cur_parse = RExC_parse;
14396             RExC_parse = (char *)orig_parse;
14397             if ( SIZE_ONLY) {
14398                 if (! LOC) {
14399
14400                     /* To get locale nodes to not use the full ANYOF size would
14401                      * require moving the code above that writes the portions
14402                      * of it that aren't in other nodes to after this point.
14403                      * e.g.  ANYOF_POSIXL_SET */
14404                     RExC_size = orig_size;
14405                 }
14406             }
14407             else {
14408                 RExC_emit = (regnode *)orig_emit;
14409                 if (PL_regkind[op] == POSIXD) {
14410                     if (op == POSIXL) {
14411                         RExC_contains_locale = 1;
14412                     }
14413                     if (invert) {
14414                         op += NPOSIXD - POSIXD;
14415                     }
14416                 }
14417             }
14418
14419             ret = reg_node(pRExC_state, op);
14420
14421             if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
14422                 if (! SIZE_ONLY) {
14423                     FLAGS(ret) = arg;
14424                 }
14425                 *flagp |= HASWIDTH|SIMPLE;
14426             }
14427             else if (PL_regkind[op] == EXACT) {
14428                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14429                                            TRUE /* downgradable to EXACT */
14430                                            );
14431             }
14432
14433             RExC_parse = (char *) cur_parse;
14434
14435             SvREFCNT_dec(posixes);
14436             SvREFCNT_dec(nposixes);
14437             SvREFCNT_dec(cp_list);
14438             SvREFCNT_dec(cp_foldable_list);
14439             return ret;
14440         }
14441     }
14442
14443     if (SIZE_ONLY)
14444         return ret;
14445     /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
14446
14447     /* If folding, we calculate all characters that could fold to or from the
14448      * ones already on the list */
14449     if (cp_foldable_list) {
14450         if (FOLD) {
14451             UV start, end;      /* End points of code point ranges */
14452
14453             SV* fold_intersection = NULL;
14454             SV** use_list;
14455
14456             /* Our calculated list will be for Unicode rules.  For locale
14457              * matching, we have to keep a separate list that is consulted at
14458              * runtime only when the locale indicates Unicode rules.  For
14459              * non-locale, we just use to the general list */
14460             if (LOC) {
14461                 use_list = &only_utf8_locale_list;
14462             }
14463             else {
14464                 use_list = &cp_list;
14465             }
14466
14467             /* Only the characters in this class that participate in folds need
14468              * be checked.  Get the intersection of this class and all the
14469              * possible characters that are foldable.  This can quickly narrow
14470              * down a large class */
14471             _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
14472                                   &fold_intersection);
14473
14474             /* The folds for all the Latin1 characters are hard-coded into this
14475              * program, but we have to go out to disk to get the others. */
14476             if (invlist_highest(cp_foldable_list) >= 256) {
14477
14478                 /* This is a hash that for a particular fold gives all
14479                  * characters that are involved in it */
14480                 if (! PL_utf8_foldclosures) {
14481                     _load_PL_utf8_foldclosures();
14482                 }
14483             }
14484
14485             /* Now look at the foldable characters in this class individually */
14486             invlist_iterinit(fold_intersection);
14487             while (invlist_iternext(fold_intersection, &start, &end)) {
14488                 UV j;
14489
14490                 /* Look at every character in the range */
14491                 for (j = start; j <= end; j++) {
14492                     U8 foldbuf[UTF8_MAXBYTES_CASE+1];
14493                     STRLEN foldlen;
14494                     SV** listp;
14495
14496                     if (j < 256) {
14497
14498                         if (IS_IN_SOME_FOLD_L1(j)) {
14499
14500                             /* ASCII is always matched; non-ASCII is matched
14501                              * only under Unicode rules (which could happen
14502                              * under /l if the locale is a UTF-8 one */
14503                             if (isASCII(j) || ! DEPENDS_SEMANTICS) {
14504                                 *use_list = add_cp_to_invlist(*use_list,
14505                                                             PL_fold_latin1[j]);
14506                             }
14507                             else {
14508                                 depends_list =
14509                                  add_cp_to_invlist(depends_list,
14510                                                    PL_fold_latin1[j]);
14511                             }
14512                         }
14513
14514                         if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
14515                             && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
14516                         {
14517                             add_above_Latin1_folds(pRExC_state,
14518                                                    (U8) j,
14519                                                    use_list);
14520                         }
14521                         continue;
14522                     }
14523
14524                     /* Here is an above Latin1 character.  We don't have the
14525                      * rules hard-coded for it.  First, get its fold.  This is
14526                      * the simple fold, as the multi-character folds have been
14527                      * handled earlier and separated out */
14528                     _to_uni_fold_flags(j, foldbuf, &foldlen,
14529                                                         (ASCII_FOLD_RESTRICTED)
14530                                                         ? FOLD_FLAGS_NOMIX_ASCII
14531                                                         : 0);
14532
14533                     /* Single character fold of above Latin1.  Add everything in
14534                     * its fold closure to the list that this node should match.
14535                     * The fold closures data structure is a hash with the keys
14536                     * being the UTF-8 of every character that is folded to, like
14537                     * 'k', and the values each an array of all code points that
14538                     * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
14539                     * Multi-character folds are not included */
14540                     if ((listp = hv_fetch(PL_utf8_foldclosures,
14541                                         (char *) foldbuf, foldlen, FALSE)))
14542                     {
14543                         AV* list = (AV*) *listp;
14544                         IV k;
14545                         for (k = 0; k <= av_tindex(list); k++) {
14546                             SV** c_p = av_fetch(list, k, FALSE);
14547                             UV c;
14548                             assert(c_p);
14549
14550                             c = SvUV(*c_p);
14551
14552                             /* /aa doesn't allow folds between ASCII and non- */
14553                             if ((ASCII_FOLD_RESTRICTED
14554                                 && (isASCII(c) != isASCII(j))))
14555                             {
14556                                 continue;
14557                             }
14558
14559                             /* Folds under /l which cross the 255/256 boundary
14560                              * are added to a separate list.  (These are valid
14561                              * only when the locale is UTF-8.) */
14562                             if (c < 256 && LOC) {
14563                                 *use_list = add_cp_to_invlist(*use_list, c);
14564                                 continue;
14565                             }
14566
14567                             if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
14568                             {
14569                                 cp_list = add_cp_to_invlist(cp_list, c);
14570                             }
14571                             else {
14572                                 /* Similarly folds involving non-ascii Latin1
14573                                 * characters under /d are added to their list */
14574                                 depends_list = add_cp_to_invlist(depends_list,
14575                                                                  c);
14576                             }
14577                         }
14578                     }
14579                 }
14580             }
14581             SvREFCNT_dec_NN(fold_intersection);
14582         }
14583
14584         /* Now that we have finished adding all the folds, there is no reason
14585          * to keep the foldable list separate */
14586         _invlist_union(cp_list, cp_foldable_list, &cp_list);
14587         SvREFCNT_dec_NN(cp_foldable_list);
14588     }
14589
14590     /* And combine the result (if any) with any inversion list from posix
14591      * classes.  The lists are kept separate up to now because we don't want to
14592      * fold the classes (folding of those is automatically handled by the swash
14593      * fetching code) */
14594     if (posixes || nposixes) {
14595         if (posixes && AT_LEAST_ASCII_RESTRICTED) {
14596             /* Under /a and /aa, nothing above ASCII matches these */
14597             _invlist_intersection(posixes,
14598                                   PL_XPosix_ptrs[_CC_ASCII],
14599                                   &posixes);
14600         }
14601         if (nposixes) {
14602             if (DEPENDS_SEMANTICS) {
14603                 /* Under /d, everything in the upper half of the Latin1 range
14604                  * matches these complements */
14605                 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_NON_ASCII_ALL;
14606             }
14607             else if (AT_LEAST_ASCII_RESTRICTED) {
14608                 /* Under /a and /aa, everything above ASCII matches these
14609                  * complements */
14610                 _invlist_union_complement_2nd(nposixes,
14611                                               PL_XPosix_ptrs[_CC_ASCII],
14612                                               &nposixes);
14613             }
14614             if (posixes) {
14615                 _invlist_union(posixes, nposixes, &posixes);
14616                 SvREFCNT_dec_NN(nposixes);
14617             }
14618             else {
14619                 posixes = nposixes;
14620             }
14621         }
14622         if (! DEPENDS_SEMANTICS) {
14623             if (cp_list) {
14624                 _invlist_union(cp_list, posixes, &cp_list);
14625                 SvREFCNT_dec_NN(posixes);
14626             }
14627             else {
14628                 cp_list = posixes;
14629             }
14630         }
14631         else {
14632             /* Under /d, we put into a separate list the Latin1 things that
14633              * match only when the target string is utf8 */
14634             SV* nonascii_but_latin1_properties = NULL;
14635             _invlist_intersection(posixes, PL_UpperLatin1,
14636                                   &nonascii_but_latin1_properties);
14637             _invlist_subtract(posixes, nonascii_but_latin1_properties,
14638                               &posixes);
14639             if (cp_list) {
14640                 _invlist_union(cp_list, posixes, &cp_list);
14641                 SvREFCNT_dec_NN(posixes);
14642             }
14643             else {
14644                 cp_list = posixes;
14645             }
14646
14647             if (depends_list) {
14648                 _invlist_union(depends_list, nonascii_but_latin1_properties,
14649                                &depends_list);
14650                 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
14651             }
14652             else {
14653                 depends_list = nonascii_but_latin1_properties;
14654             }
14655         }
14656     }
14657
14658     /* And combine the result (if any) with any inversion list from properties.
14659      * The lists are kept separate up to now so that we can distinguish the two
14660      * in regards to matching above-Unicode.  A run-time warning is generated
14661      * if a Unicode property is matched against a non-Unicode code point. But,
14662      * we allow user-defined properties to match anything, without any warning,
14663      * and we also suppress the warning if there is a portion of the character
14664      * class that isn't a Unicode property, and which matches above Unicode, \W
14665      * or [\x{110000}] for example.
14666      * (Note that in this case, unlike the Posix one above, there is no
14667      * <depends_list>, because having a Unicode property forces Unicode
14668      * semantics */
14669     if (properties) {
14670         if (cp_list) {
14671
14672             /* If it matters to the final outcome, see if a non-property
14673              * component of the class matches above Unicode.  If so, the
14674              * warning gets suppressed.  This is true even if just a single
14675              * such code point is specified, as though not strictly correct if
14676              * another such code point is matched against, the fact that they
14677              * are using above-Unicode code points indicates they should know
14678              * the issues involved */
14679             if (warn_super) {
14680                 warn_super = ! (invert
14681                                ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
14682             }
14683
14684             _invlist_union(properties, cp_list, &cp_list);
14685             SvREFCNT_dec_NN(properties);
14686         }
14687         else {
14688             cp_list = properties;
14689         }
14690
14691         if (warn_super) {
14692             ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
14693         }
14694     }
14695
14696     /* Here, we have calculated what code points should be in the character
14697      * class.
14698      *
14699      * Now we can see about various optimizations.  Fold calculation (which we
14700      * did above) needs to take place before inversion.  Otherwise /[^k]/i
14701      * would invert to include K, which under /i would match k, which it
14702      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
14703      * folded until runtime */
14704
14705     /* If we didn't do folding, it's because some information isn't available
14706      * until runtime; set the run-time fold flag for these.  (We don't have to
14707      * worry about properties folding, as that is taken care of by the swash
14708      * fetching).  We know to set the flag if we have a non-NULL list for UTF-8
14709      * locales, or the class matches at least one 0-255 range code point */
14710     if (LOC && FOLD) {
14711         if (only_utf8_locale_list) {
14712             ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14713         }
14714         else if (cp_list) { /* Look to see if there a 0-255 code point is in
14715                                the list */
14716             UV start, end;
14717             invlist_iterinit(cp_list);
14718             if (invlist_iternext(cp_list, &start, &end) && start < 256) {
14719                 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14720             }
14721             invlist_iterfinish(cp_list);
14722         }
14723     }
14724
14725     /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
14726      * at compile time.  Besides not inverting folded locale now, we can't
14727      * invert if there are things such as \w, which aren't known until runtime
14728      * */
14729     if (cp_list
14730         && invert
14731         && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
14732         && ! depends_list
14733         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14734     {
14735         _invlist_invert(cp_list);
14736
14737         /* Any swash can't be used as-is, because we've inverted things */
14738         if (swash) {
14739             SvREFCNT_dec_NN(swash);
14740             swash = NULL;
14741         }
14742
14743         /* Clear the invert flag since have just done it here */
14744         invert = FALSE;
14745     }
14746
14747     if (ret_invlist) {
14748         *ret_invlist = cp_list;
14749         SvREFCNT_dec(swash);
14750
14751         /* Discard the generated node */
14752         if (SIZE_ONLY) {
14753             RExC_size = orig_size;
14754         }
14755         else {
14756             RExC_emit = orig_emit;
14757         }
14758         return orig_emit;
14759     }
14760
14761     /* Some character classes are equivalent to other nodes.  Such nodes take
14762      * up less room and generally fewer operations to execute than ANYOF nodes.
14763      * Above, we checked for and optimized into some such equivalents for
14764      * certain common classes that are easy to test.  Getting to this point in
14765      * the code means that the class didn't get optimized there.  Since this
14766      * code is only executed in Pass 2, it is too late to save space--it has
14767      * been allocated in Pass 1, and currently isn't given back.  But turning
14768      * things into an EXACTish node can allow the optimizer to join it to any
14769      * adjacent such nodes.  And if the class is equivalent to things like /./,
14770      * expensive run-time swashes can be avoided.  Now that we have more
14771      * complete information, we can find things necessarily missed by the
14772      * earlier code.  I (khw) am not sure how much to look for here.  It would
14773      * be easy, but perhaps too slow, to check any candidates against all the
14774      * node types they could possibly match using _invlistEQ(). */
14775
14776     if (cp_list
14777         && ! invert
14778         && ! depends_list
14779         && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
14780         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14781
14782            /* We don't optimize if we are supposed to make sure all non-Unicode
14783             * code points raise a warning, as only ANYOF nodes have this check.
14784             * */
14785         && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
14786     {
14787         UV start, end;
14788         U8 op = END;  /* The optimzation node-type */
14789         const char * cur_parse= RExC_parse;
14790
14791         invlist_iterinit(cp_list);
14792         if (! invlist_iternext(cp_list, &start, &end)) {
14793
14794             /* Here, the list is empty.  This happens, for example, when a
14795              * Unicode property is the only thing in the character class, and
14796              * it doesn't match anything.  (perluniprops.pod notes such
14797              * properties) */
14798             op = OPFAIL;
14799             *flagp |= HASWIDTH|SIMPLE;
14800         }
14801         else if (start == end) {    /* The range is a single code point */
14802             if (! invlist_iternext(cp_list, &start, &end)
14803
14804                     /* Don't do this optimization if it would require changing
14805                      * the pattern to UTF-8 */
14806                 && (start < 256 || UTF))
14807             {
14808                 /* Here, the list contains a single code point.  Can optimize
14809                  * into an EXACTish node */
14810
14811                 value = start;
14812
14813                 if (! FOLD) {
14814                     op = EXACT;
14815                 }
14816                 else if (LOC) {
14817
14818                     /* A locale node under folding with one code point can be
14819                      * an EXACTFL, as its fold won't be calculated until
14820                      * runtime */
14821                     op = EXACTFL;
14822                 }
14823                 else {
14824
14825                     /* Here, we are generally folding, but there is only one
14826                      * code point to match.  If we have to, we use an EXACT
14827                      * node, but it would be better for joining with adjacent
14828                      * nodes in the optimization pass if we used the same
14829                      * EXACTFish node that any such are likely to be.  We can
14830                      * do this iff the code point doesn't participate in any
14831                      * folds.  For example, an EXACTF of a colon is the same as
14832                      * an EXACT one, since nothing folds to or from a colon. */
14833                     if (value < 256) {
14834                         if (IS_IN_SOME_FOLD_L1(value)) {
14835                             op = EXACT;
14836                         }
14837                     }
14838                     else {
14839                         if (_invlist_contains_cp(PL_utf8_foldable, value)) {
14840                             op = EXACT;
14841                         }
14842                     }
14843
14844                     /* If we haven't found the node type, above, it means we
14845                      * can use the prevailing one */
14846                     if (op == END) {
14847                         op = compute_EXACTish(pRExC_state);
14848                     }
14849                 }
14850             }
14851         }
14852         else if (start == 0) {
14853             if (end == UV_MAX) {
14854                 op = SANY;
14855                 *flagp |= HASWIDTH|SIMPLE;
14856                 RExC_naughty++;
14857             }
14858             else if (end == '\n' - 1
14859                     && invlist_iternext(cp_list, &start, &end)
14860                     && start == '\n' + 1 && end == UV_MAX)
14861             {
14862                 op = REG_ANY;
14863                 *flagp |= HASWIDTH|SIMPLE;
14864                 RExC_naughty++;
14865             }
14866         }
14867         invlist_iterfinish(cp_list);
14868
14869         if (op != END) {
14870             RExC_parse = (char *)orig_parse;
14871             RExC_emit = (regnode *)orig_emit;
14872
14873             ret = reg_node(pRExC_state, op);
14874
14875             RExC_parse = (char *)cur_parse;
14876
14877             if (PL_regkind[op] == EXACT) {
14878                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14879                                            TRUE /* downgradable to EXACT */
14880                                           );
14881             }
14882
14883             SvREFCNT_dec_NN(cp_list);
14884             return ret;
14885         }
14886     }
14887
14888     /* Here, <cp_list> contains all the code points we can determine at
14889      * compile time that match under all conditions.  Go through it, and
14890      * for things that belong in the bitmap, put them there, and delete from
14891      * <cp_list>.  While we are at it, see if everything above 255 is in the
14892      * list, and if so, set a flag to speed up execution */
14893
14894     populate_ANYOF_from_invlist(ret, &cp_list);
14895
14896     if (invert) {
14897         ANYOF_FLAGS(ret) |= ANYOF_INVERT;
14898     }
14899
14900     /* Here, the bitmap has been populated with all the Latin1 code points that
14901      * always match.  Can now add to the overall list those that match only
14902      * when the target string is UTF-8 (<depends_list>). */
14903     if (depends_list) {
14904         if (cp_list) {
14905             _invlist_union(cp_list, depends_list, &cp_list);
14906             SvREFCNT_dec_NN(depends_list);
14907         }
14908         else {
14909             cp_list = depends_list;
14910         }
14911         ANYOF_FLAGS(ret) |= ANYOF_UTF8;
14912     }
14913
14914     /* If there is a swash and more than one element, we can't use the swash in
14915      * the optimization below. */
14916     if (swash && element_count > 1) {
14917         SvREFCNT_dec_NN(swash);
14918         swash = NULL;
14919     }
14920
14921     set_ANYOF_arg(pRExC_state, ret, cp_list,
14922                   (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14923                    ? listsv : NULL,
14924                   only_utf8_locale_list,
14925                   swash, has_user_defined_property);
14926
14927     *flagp |= HASWIDTH|SIMPLE;
14928
14929     if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
14930         RExC_contains_locale = 1;
14931     }
14932
14933     return ret;
14934 }
14935
14936 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14937
14938 STATIC void
14939 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
14940                 regnode* const node,
14941                 SV* const cp_list,
14942                 SV* const runtime_defns,
14943                 SV* const only_utf8_locale_list,
14944                 SV* const swash,
14945                 const bool has_user_defined_property)
14946 {
14947     /* Sets the arg field of an ANYOF-type node 'node', using information about
14948      * the node passed-in.  If there is nothing outside the node's bitmap, the
14949      * arg is set to ANYOF_NONBITMAP_EMPTY.  Otherwise, it sets the argument to
14950      * the count returned by add_data(), having allocated and stored an array,
14951      * av, that that count references, as follows:
14952      *  av[0] stores the character class description in its textual form.
14953      *        This is used later (regexec.c:Perl_regclass_swash()) to
14954      *        initialize the appropriate swash, and is also useful for dumping
14955      *        the regnode.  This is set to &PL_sv_undef if the textual
14956      *        description is not needed at run-time (as happens if the other
14957      *        elements completely define the class)
14958      *  av[1] if &PL_sv_undef, is a placeholder to later contain the swash
14959      *        computed from av[0].  But if no further computation need be done,
14960      *        the swash is stored here now (and av[0] is &PL_sv_undef).
14961      *  av[2] stores the inversion list of code points that match only if the
14962      *        current locale is UTF-8
14963      *  av[3] stores the cp_list inversion list for use in addition or instead
14964      *        of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
14965      *        (Otherwise everything needed is already in av[0] and av[1])
14966      *  av[4] is set if any component of the class is from a user-defined
14967      *        property; used only if av[3] exists */
14968
14969     UV n;
14970
14971     PERL_ARGS_ASSERT_SET_ANYOF_ARG;
14972
14973     if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
14974         assert(! (ANYOF_FLAGS(node)
14975                     & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8)));
14976         ARG_SET(node, ANYOF_NONBITMAP_EMPTY);
14977     }
14978     else {
14979         AV * const av = newAV();
14980         SV *rv;
14981
14982         assert(ANYOF_FLAGS(node)
14983                     & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD));
14984
14985         av_store(av, 0, (runtime_defns)
14986                         ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
14987         if (swash) {
14988             assert(cp_list);
14989             av_store(av, 1, swash);
14990             SvREFCNT_dec_NN(cp_list);
14991         }
14992         else {
14993             av_store(av, 1, &PL_sv_undef);
14994             if (cp_list) {
14995                 av_store(av, 3, cp_list);
14996                 av_store(av, 4, newSVuv(has_user_defined_property));
14997             }
14998         }
14999
15000         if (only_utf8_locale_list) {
15001             av_store(av, 2, only_utf8_locale_list);
15002         }
15003         else {
15004             av_store(av, 2, &PL_sv_undef);
15005         }
15006
15007         rv = newRV_noinc(MUTABLE_SV(av));
15008         n = add_data(pRExC_state, STR_WITH_LEN("s"));
15009         RExC_rxi->data->data[n] = (void*)rv;
15010         ARG_SET(node, n);
15011     }
15012 }
15013
15014
15015 /* reg_skipcomment()
15016
15017    Absorbs an /x style # comment from the input stream,
15018    returning a pointer to the first character beyond the comment, or if the
15019    comment terminates the pattern without anything following it, this returns
15020    one past the final character of the pattern (in other words, RExC_end) and
15021    sets the REG_RUN_ON_COMMENT_SEEN flag.
15022
15023    Note it's the callers responsibility to ensure that we are
15024    actually in /x mode
15025
15026 */
15027
15028 PERL_STATIC_INLINE char*
15029 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state, char* p)
15030 {
15031     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
15032
15033     assert(*p = '#');
15034
15035     while (p < RExC_end) {
15036         if (*(++p) == '\n') {
15037             return p+1;
15038         }
15039     }
15040
15041     /* we ran off the end of the pattern without ending the comment, so we have
15042      * to add an \n when wrapping */
15043     RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
15044     return p;
15045 }
15046
15047 /* nextchar()
15048
15049    Advances the parse position, and optionally absorbs
15050    "whitespace" from the inputstream.
15051
15052    Without /x "whitespace" means (?#...) style comments only,
15053    with /x this means (?#...) and # comments and whitespace proper.
15054
15055    Returns the RExC_parse point from BEFORE the scan occurs.
15056
15057    This is the /x friendly way of saying RExC_parse++.
15058 */
15059
15060 STATIC char*
15061 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
15062 {
15063     char* const retval = RExC_parse++;
15064
15065     PERL_ARGS_ASSERT_NEXTCHAR;
15066
15067     for (;;) {
15068         if (RExC_end - RExC_parse >= 3
15069             && *RExC_parse == '('
15070             && RExC_parse[1] == '?'
15071             && RExC_parse[2] == '#')
15072         {
15073             while (*RExC_parse != ')') {
15074                 if (RExC_parse == RExC_end)
15075                     FAIL("Sequence (?#... not terminated");
15076                 RExC_parse++;
15077             }
15078             RExC_parse++;
15079             continue;
15080         }
15081         if (RExC_flags & RXf_PMf_EXTENDED) {
15082             char * p = regpatws(pRExC_state, RExC_parse,
15083                                           TRUE); /* means recognize comments */
15084             if (p != RExC_parse) {
15085                 RExC_parse = p;
15086                 continue;
15087             }
15088         }
15089         return retval;
15090     }
15091 }
15092
15093 /*
15094 - reg_node - emit a node
15095 */
15096 STATIC regnode *                        /* Location. */
15097 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
15098 {
15099     dVAR;
15100     regnode *ptr;
15101     regnode * const ret = RExC_emit;
15102     GET_RE_DEBUG_FLAGS_DECL;
15103
15104     PERL_ARGS_ASSERT_REG_NODE;
15105
15106     if (SIZE_ONLY) {
15107         SIZE_ALIGN(RExC_size);
15108         RExC_size += 1;
15109         return(ret);
15110     }
15111     if (RExC_emit >= RExC_emit_bound)
15112         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15113                    op, RExC_emit, RExC_emit_bound);
15114
15115     NODE_ALIGN_FILL(ret);
15116     ptr = ret;
15117     FILL_ADVANCE_NODE(ptr, op);
15118 #ifdef RE_TRACK_PATTERN_OFFSETS
15119     if (RExC_offsets) {         /* MJD */
15120         MJD_OFFSET_DEBUG(
15121               ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
15122               "reg_node", __LINE__,
15123               PL_reg_name[op],
15124               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
15125                 ? "Overwriting end of array!\n" : "OK",
15126               (UV)(RExC_emit - RExC_emit_start),
15127               (UV)(RExC_parse - RExC_start),
15128               (UV)RExC_offsets[0]));
15129         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
15130     }
15131 #endif
15132     RExC_emit = ptr;
15133     return(ret);
15134 }
15135
15136 /*
15137 - reganode - emit a node with an argument
15138 */
15139 STATIC regnode *                        /* Location. */
15140 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
15141 {
15142     dVAR;
15143     regnode *ptr;
15144     regnode * const ret = RExC_emit;
15145     GET_RE_DEBUG_FLAGS_DECL;
15146
15147     PERL_ARGS_ASSERT_REGANODE;
15148
15149     if (SIZE_ONLY) {
15150         SIZE_ALIGN(RExC_size);
15151         RExC_size += 2;
15152         /*
15153            We can't do this:
15154
15155            assert(2==regarglen[op]+1);
15156
15157            Anything larger than this has to allocate the extra amount.
15158            If we changed this to be:
15159
15160            RExC_size += (1 + regarglen[op]);
15161
15162            then it wouldn't matter. Its not clear what side effect
15163            might come from that so its not done so far.
15164            -- dmq
15165         */
15166         return(ret);
15167     }
15168     if (RExC_emit >= RExC_emit_bound)
15169         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15170                    op, RExC_emit, RExC_emit_bound);
15171
15172     NODE_ALIGN_FILL(ret);
15173     ptr = ret;
15174     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
15175 #ifdef RE_TRACK_PATTERN_OFFSETS
15176     if (RExC_offsets) {         /* MJD */
15177         MJD_OFFSET_DEBUG(
15178               ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15179               "reganode",
15180               __LINE__,
15181               PL_reg_name[op],
15182               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
15183               "Overwriting end of array!\n" : "OK",
15184               (UV)(RExC_emit - RExC_emit_start),
15185               (UV)(RExC_parse - RExC_start),
15186               (UV)RExC_offsets[0]));
15187         Set_Cur_Node_Offset;
15188     }
15189 #endif
15190     RExC_emit = ptr;
15191     return(ret);
15192 }
15193
15194 /*
15195 - reguni - emit (if appropriate) a Unicode character
15196 */
15197 PERL_STATIC_INLINE STRLEN
15198 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
15199 {
15200     dVAR;
15201
15202     PERL_ARGS_ASSERT_REGUNI;
15203
15204     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
15205 }
15206
15207 /*
15208 - reginsert - insert an operator in front of already-emitted operand
15209 *
15210 * Means relocating the operand.
15211 */
15212 STATIC void
15213 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
15214 {
15215     dVAR;
15216     regnode *src;
15217     regnode *dst;
15218     regnode *place;
15219     const int offset = regarglen[(U8)op];
15220     const int size = NODE_STEP_REGNODE + offset;
15221     GET_RE_DEBUG_FLAGS_DECL;
15222
15223     PERL_ARGS_ASSERT_REGINSERT;
15224     PERL_UNUSED_ARG(depth);
15225 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
15226     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
15227     if (SIZE_ONLY) {
15228         RExC_size += size;
15229         return;
15230     }
15231
15232     src = RExC_emit;
15233     RExC_emit += size;
15234     dst = RExC_emit;
15235     if (RExC_open_parens) {
15236         int paren;
15237         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
15238         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
15239             if ( RExC_open_parens[paren] >= opnd ) {
15240                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
15241                 RExC_open_parens[paren] += size;
15242             } else {
15243                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
15244             }
15245             if ( RExC_close_parens[paren] >= opnd ) {
15246                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
15247                 RExC_close_parens[paren] += size;
15248             } else {
15249                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
15250             }
15251         }
15252     }
15253
15254     while (src > opnd) {
15255         StructCopy(--src, --dst, regnode);
15256 #ifdef RE_TRACK_PATTERN_OFFSETS
15257         if (RExC_offsets) {     /* MJD 20010112 */
15258             MJD_OFFSET_DEBUG(
15259                  ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
15260                   "reg_insert",
15261                   __LINE__,
15262                   PL_reg_name[op],
15263                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
15264                     ? "Overwriting end of array!\n" : "OK",
15265                   (UV)(src - RExC_emit_start),
15266                   (UV)(dst - RExC_emit_start),
15267                   (UV)RExC_offsets[0]));
15268             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
15269             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
15270         }
15271 #endif
15272     }
15273
15274
15275     place = opnd;               /* Op node, where operand used to be. */
15276 #ifdef RE_TRACK_PATTERN_OFFSETS
15277     if (RExC_offsets) {         /* MJD */
15278         MJD_OFFSET_DEBUG(
15279               ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15280               "reginsert",
15281               __LINE__,
15282               PL_reg_name[op],
15283               (UV)(place - RExC_emit_start) > RExC_offsets[0]
15284               ? "Overwriting end of array!\n" : "OK",
15285               (UV)(place - RExC_emit_start),
15286               (UV)(RExC_parse - RExC_start),
15287               (UV)RExC_offsets[0]));
15288         Set_Node_Offset(place, RExC_parse);
15289         Set_Node_Length(place, 1);
15290     }
15291 #endif
15292     src = NEXTOPER(place);
15293     FILL_ADVANCE_NODE(place, op);
15294     Zero(src, offset, regnode);
15295 }
15296
15297 /*
15298 - regtail - set the next-pointer at the end of a node chain of p to val.
15299 - SEE ALSO: regtail_study
15300 */
15301 /* TODO: All three parms should be const */
15302 STATIC void
15303 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15304                 const regnode *val,U32 depth)
15305 {
15306     dVAR;
15307     regnode *scan;
15308     GET_RE_DEBUG_FLAGS_DECL;
15309
15310     PERL_ARGS_ASSERT_REGTAIL;
15311 #ifndef DEBUGGING
15312     PERL_UNUSED_ARG(depth);
15313 #endif
15314
15315     if (SIZE_ONLY)
15316         return;
15317
15318     /* Find last node. */
15319     scan = p;
15320     for (;;) {
15321         regnode * const temp = regnext(scan);
15322         DEBUG_PARSE_r({
15323             SV * const mysv=sv_newmortal();
15324             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
15325             regprop(RExC_rx, mysv, scan, NULL);
15326             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
15327                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
15328                     (temp == NULL ? "->" : ""),
15329                     (temp == NULL ? PL_reg_name[OP(val)] : "")
15330             );
15331         });
15332         if (temp == NULL)
15333             break;
15334         scan = temp;
15335     }
15336
15337     if (reg_off_by_arg[OP(scan)]) {
15338         ARG_SET(scan, val - scan);
15339     }
15340     else {
15341         NEXT_OFF(scan) = val - scan;
15342     }
15343 }
15344
15345 #ifdef DEBUGGING
15346 /*
15347 - regtail_study - set the next-pointer at the end of a node chain of p to val.
15348 - Look for optimizable sequences at the same time.
15349 - currently only looks for EXACT chains.
15350
15351 This is experimental code. The idea is to use this routine to perform
15352 in place optimizations on branches and groups as they are constructed,
15353 with the long term intention of removing optimization from study_chunk so
15354 that it is purely analytical.
15355
15356 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
15357 to control which is which.
15358
15359 */
15360 /* TODO: All four parms should be const */
15361
15362 STATIC U8
15363 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15364                       const regnode *val,U32 depth)
15365 {
15366     dVAR;
15367     regnode *scan;
15368     U8 exact = PSEUDO;
15369 #ifdef EXPERIMENTAL_INPLACESCAN
15370     I32 min = 0;
15371 #endif
15372     GET_RE_DEBUG_FLAGS_DECL;
15373
15374     PERL_ARGS_ASSERT_REGTAIL_STUDY;
15375
15376
15377     if (SIZE_ONLY)
15378         return exact;
15379
15380     /* Find last node. */
15381
15382     scan = p;
15383     for (;;) {
15384         regnode * const temp = regnext(scan);
15385 #ifdef EXPERIMENTAL_INPLACESCAN
15386         if (PL_regkind[OP(scan)] == EXACT) {
15387             bool unfolded_multi_char;   /* Unexamined in this routine */
15388             if (join_exact(pRExC_state, scan, &min,
15389                            &unfolded_multi_char, 1, val, depth+1))
15390                 return EXACT;
15391         }
15392 #endif
15393         if ( exact ) {
15394             switch (OP(scan)) {
15395                 case EXACT:
15396                 case EXACTF:
15397                 case EXACTFA_NO_TRIE:
15398                 case EXACTFA:
15399                 case EXACTFU:
15400                 case EXACTFU_SS:
15401                 case EXACTFL:
15402                         if( exact == PSEUDO )
15403                             exact= OP(scan);
15404                         else if ( exact != OP(scan) )
15405                             exact= 0;
15406                 case NOTHING:
15407                     break;
15408                 default:
15409                     exact= 0;
15410             }
15411         }
15412         DEBUG_PARSE_r({
15413             SV * const mysv=sv_newmortal();
15414             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
15415             regprop(RExC_rx, mysv, scan, NULL);
15416             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
15417                 SvPV_nolen_const(mysv),
15418                 REG_NODE_NUM(scan),
15419                 PL_reg_name[exact]);
15420         });
15421         if (temp == NULL)
15422             break;
15423         scan = temp;
15424     }
15425     DEBUG_PARSE_r({
15426         SV * const mysv_val=sv_newmortal();
15427         DEBUG_PARSE_MSG("");
15428         regprop(RExC_rx, mysv_val, val, NULL);
15429         PerlIO_printf(Perl_debug_log,
15430                       "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
15431                       SvPV_nolen_const(mysv_val),
15432                       (IV)REG_NODE_NUM(val),
15433                       (IV)(val - scan)
15434         );
15435     });
15436     if (reg_off_by_arg[OP(scan)]) {
15437         ARG_SET(scan, val - scan);
15438     }
15439     else {
15440         NEXT_OFF(scan) = val - scan;
15441     }
15442
15443     return exact;
15444 }
15445 #endif
15446
15447 /*
15448  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
15449  */
15450 #ifdef DEBUGGING
15451
15452 static void
15453 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
15454 {
15455     int bit;
15456     int set=0;
15457
15458     ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15459
15460     for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
15461         if (flags & (1<<bit)) {
15462             if (!set++ && lead)
15463                 PerlIO_printf(Perl_debug_log, "%s",lead);
15464             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
15465         }
15466     }
15467     if (lead)  {
15468         if (set)
15469             PerlIO_printf(Perl_debug_log, "\n");
15470         else
15471             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15472     }
15473 }
15474
15475 static void
15476 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
15477 {
15478     int bit;
15479     int set=0;
15480     regex_charset cs;
15481
15482     ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15483
15484     for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
15485         if (flags & (1<<bit)) {
15486             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
15487                 continue;
15488             }
15489             if (!set++ && lead)
15490                 PerlIO_printf(Perl_debug_log, "%s",lead);
15491             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
15492         }
15493     }
15494     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
15495             if (!set++ && lead) {
15496                 PerlIO_printf(Perl_debug_log, "%s",lead);
15497             }
15498             switch (cs) {
15499                 case REGEX_UNICODE_CHARSET:
15500                     PerlIO_printf(Perl_debug_log, "UNICODE");
15501                     break;
15502                 case REGEX_LOCALE_CHARSET:
15503                     PerlIO_printf(Perl_debug_log, "LOCALE");
15504                     break;
15505                 case REGEX_ASCII_RESTRICTED_CHARSET:
15506                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
15507                     break;
15508                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
15509                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
15510                     break;
15511                 default:
15512                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
15513                     break;
15514             }
15515     }
15516     if (lead)  {
15517         if (set)
15518             PerlIO_printf(Perl_debug_log, "\n");
15519         else
15520             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15521     }
15522 }
15523 #endif
15524
15525 void
15526 Perl_regdump(pTHX_ const regexp *r)
15527 {
15528 #ifdef DEBUGGING
15529     dVAR;
15530     SV * const sv = sv_newmortal();
15531     SV *dsv= sv_newmortal();
15532     RXi_GET_DECL(r,ri);
15533     GET_RE_DEBUG_FLAGS_DECL;
15534
15535     PERL_ARGS_ASSERT_REGDUMP;
15536
15537     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
15538
15539     /* Header fields of interest. */
15540     if (r->anchored_substr) {
15541         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
15542             RE_SV_DUMPLEN(r->anchored_substr), 30);
15543         PerlIO_printf(Perl_debug_log,
15544                       "anchored %s%s at %"IVdf" ",
15545                       s, RE_SV_TAIL(r->anchored_substr),
15546                       (IV)r->anchored_offset);
15547     } else if (r->anchored_utf8) {
15548         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
15549             RE_SV_DUMPLEN(r->anchored_utf8), 30);
15550         PerlIO_printf(Perl_debug_log,
15551                       "anchored utf8 %s%s at %"IVdf" ",
15552                       s, RE_SV_TAIL(r->anchored_utf8),
15553                       (IV)r->anchored_offset);
15554     }
15555     if (r->float_substr) {
15556         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
15557             RE_SV_DUMPLEN(r->float_substr), 30);
15558         PerlIO_printf(Perl_debug_log,
15559                       "floating %s%s at %"IVdf"..%"UVuf" ",
15560                       s, RE_SV_TAIL(r->float_substr),
15561                       (IV)r->float_min_offset, (UV)r->float_max_offset);
15562     } else if (r->float_utf8) {
15563         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
15564             RE_SV_DUMPLEN(r->float_utf8), 30);
15565         PerlIO_printf(Perl_debug_log,
15566                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
15567                       s, RE_SV_TAIL(r->float_utf8),
15568                       (IV)r->float_min_offset, (UV)r->float_max_offset);
15569     }
15570     if (r->check_substr || r->check_utf8)
15571         PerlIO_printf(Perl_debug_log,
15572                       (const char *)
15573                       (r->check_substr == r->float_substr
15574                        && r->check_utf8 == r->float_utf8
15575                        ? "(checking floating" : "(checking anchored"));
15576     if (r->intflags & PREGf_NOSCAN)
15577         PerlIO_printf(Perl_debug_log, " noscan");
15578     if (r->extflags & RXf_CHECK_ALL)
15579         PerlIO_printf(Perl_debug_log, " isall");
15580     if (r->check_substr || r->check_utf8)
15581         PerlIO_printf(Perl_debug_log, ") ");
15582
15583     if (ri->regstclass) {
15584         regprop(r, sv, ri->regstclass, NULL);
15585         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
15586     }
15587     if (r->intflags & PREGf_ANCH) {
15588         PerlIO_printf(Perl_debug_log, "anchored");
15589         if (r->intflags & PREGf_ANCH_BOL)
15590             PerlIO_printf(Perl_debug_log, "(BOL)");
15591         if (r->intflags & PREGf_ANCH_MBOL)
15592             PerlIO_printf(Perl_debug_log, "(MBOL)");
15593         if (r->intflags & PREGf_ANCH_SBOL)
15594             PerlIO_printf(Perl_debug_log, "(SBOL)");
15595         if (r->intflags & PREGf_ANCH_GPOS)
15596             PerlIO_printf(Perl_debug_log, "(GPOS)");
15597         PerlIO_putc(Perl_debug_log, ' ');
15598     }
15599     if (r->intflags & PREGf_GPOS_SEEN)
15600         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
15601     if (r->intflags & PREGf_SKIP)
15602         PerlIO_printf(Perl_debug_log, "plus ");
15603     if (r->intflags & PREGf_IMPLICIT)
15604         PerlIO_printf(Perl_debug_log, "implicit ");
15605     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
15606     if (r->extflags & RXf_EVAL_SEEN)
15607         PerlIO_printf(Perl_debug_log, "with eval ");
15608     PerlIO_printf(Perl_debug_log, "\n");
15609     DEBUG_FLAGS_r({
15610         regdump_extflags("r->extflags: ",r->extflags);
15611         regdump_intflags("r->intflags: ",r->intflags);
15612     });
15613 #else
15614     PERL_ARGS_ASSERT_REGDUMP;
15615     PERL_UNUSED_CONTEXT;
15616     PERL_UNUSED_ARG(r);
15617 #endif  /* DEBUGGING */
15618 }
15619
15620 /*
15621 - regprop - printable representation of opcode, with run time support
15622 */
15623
15624 void
15625 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo)
15626 {
15627 #ifdef DEBUGGING
15628     dVAR;
15629     int k;
15630
15631     /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
15632     static const char * const anyofs[] = {
15633 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
15634     || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
15635     || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
15636     || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
15637     || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
15638     || _CC_VERTSPACE != 16
15639   #error Need to adjust order of anyofs[]
15640 #endif
15641         "\\w",
15642         "\\W",
15643         "\\d",
15644         "\\D",
15645         "[:alpha:]",
15646         "[:^alpha:]",
15647         "[:lower:]",
15648         "[:^lower:]",
15649         "[:upper:]",
15650         "[:^upper:]",
15651         "[:punct:]",
15652         "[:^punct:]",
15653         "[:print:]",
15654         "[:^print:]",
15655         "[:alnum:]",
15656         "[:^alnum:]",
15657         "[:graph:]",
15658         "[:^graph:]",
15659         "[:cased:]",
15660         "[:^cased:]",
15661         "\\s",
15662         "\\S",
15663         "[:blank:]",
15664         "[:^blank:]",
15665         "[:xdigit:]",
15666         "[:^xdigit:]",
15667         "[:space:]",
15668         "[:^space:]",
15669         "[:cntrl:]",
15670         "[:^cntrl:]",
15671         "[:ascii:]",
15672         "[:^ascii:]",
15673         "\\v",
15674         "\\V"
15675     };
15676     RXi_GET_DECL(prog,progi);
15677     GET_RE_DEBUG_FLAGS_DECL;
15678
15679     PERL_ARGS_ASSERT_REGPROP;
15680
15681     sv_setpvs(sv, "");
15682
15683     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
15684         /* It would be nice to FAIL() here, but this may be called from
15685            regexec.c, and it would be hard to supply pRExC_state. */
15686         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
15687                                               (int)OP(o), (int)REGNODE_MAX);
15688     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
15689
15690     k = PL_regkind[OP(o)];
15691
15692     if (k == EXACT) {
15693         sv_catpvs(sv, " ");
15694         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
15695          * is a crude hack but it may be the best for now since
15696          * we have no flag "this EXACTish node was UTF-8"
15697          * --jhi */
15698         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
15699                   PERL_PV_ESCAPE_UNI_DETECT |
15700                   PERL_PV_ESCAPE_NONASCII   |
15701                   PERL_PV_PRETTY_ELLIPSES   |
15702                   PERL_PV_PRETTY_LTGT       |
15703                   PERL_PV_PRETTY_NOCLEAR
15704                   );
15705     } else if (k == TRIE) {
15706         /* print the details of the trie in dumpuntil instead, as
15707          * progi->data isn't available here */
15708         const char op = OP(o);
15709         const U32 n = ARG(o);
15710         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
15711                (reg_ac_data *)progi->data->data[n] :
15712                NULL;
15713         const reg_trie_data * const trie
15714             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
15715
15716         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
15717         DEBUG_TRIE_COMPILE_r(
15718           Perl_sv_catpvf(aTHX_ sv,
15719             "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
15720             (UV)trie->startstate,
15721             (IV)trie->statecount-1, /* -1 because of the unused 0 element */
15722             (UV)trie->wordcount,
15723             (UV)trie->minlen,
15724             (UV)trie->maxlen,
15725             (UV)TRIE_CHARCOUNT(trie),
15726             (UV)trie->uniquecharcount
15727           );
15728         );
15729         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
15730             sv_catpvs(sv, "[");
15731             (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
15732                                                    ? ANYOF_BITMAP(o)
15733                                                    : TRIE_BITMAP(trie));
15734             sv_catpvs(sv, "]");
15735         }
15736
15737     } else if (k == CURLY) {
15738         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
15739             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
15740         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
15741     }
15742     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
15743         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
15744     else if (k == REF || k == OPEN || k == CLOSE
15745              || k == GROUPP || OP(o)==ACCEPT)
15746     {
15747         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
15748         if ( RXp_PAREN_NAMES(prog) ) {
15749             if ( k != REF || (OP(o) < NREF)) {
15750                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
15751                 SV **name= av_fetch(list, ARG(o), 0 );
15752                 if (name)
15753                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15754             }
15755             else {
15756                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
15757                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
15758                 I32 *nums=(I32*)SvPVX(sv_dat);
15759                 SV **name= av_fetch(list, nums[0], 0 );
15760                 I32 n;
15761                 if (name) {
15762                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
15763                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
15764                                     (n ? "," : ""), (IV)nums[n]);
15765                     }
15766                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15767                 }
15768             }
15769         }
15770         if ( k == REF && reginfo) {
15771             U32 n = ARG(o);  /* which paren pair */
15772             I32 ln = prog->offs[n].start;
15773             if (prog->lastparen < n || ln == -1)
15774                 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
15775             else if (ln == prog->offs[n].end)
15776                 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
15777             else {
15778                 const char *s = reginfo->strbeg + ln;
15779                 Perl_sv_catpvf(aTHX_ sv, ": ");
15780                 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
15781                     PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
15782             }
15783         }
15784     } else if (k == GOSUB)
15785         /* Paren and offset */
15786         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
15787     else if (k == VERB) {
15788         if (!o->flags)
15789             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
15790                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
15791     } else if (k == LOGICAL)
15792         /* 2: embedded, otherwise 1 */
15793         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
15794     else if (k == ANYOF) {
15795         const U8 flags = ANYOF_FLAGS(o);
15796         int do_sep = 0;
15797
15798
15799         if (flags & ANYOF_LOCALE_FLAGS)
15800             sv_catpvs(sv, "{loc}");
15801         if (flags & ANYOF_LOC_FOLD)
15802             sv_catpvs(sv, "{i}");
15803         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
15804         if (flags & ANYOF_INVERT)
15805             sv_catpvs(sv, "^");
15806
15807         /* output what the standard cp 0-255 bitmap matches */
15808         do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
15809
15810         /* output any special charclass tests (used entirely under use
15811          * locale) * */
15812         if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
15813             int i;
15814             for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
15815                 if (ANYOF_POSIXL_TEST(o,i)) {
15816                     sv_catpv(sv, anyofs[i]);
15817                     do_sep = 1;
15818                 }
15819             }
15820         }
15821
15822         if ((flags & (ANYOF_ABOVE_LATIN1_ALL
15823                       |ANYOF_UTF8
15824                       |ANYOF_NONBITMAP_NON_UTF8
15825                       |ANYOF_LOC_FOLD)))
15826         {
15827             if (do_sep) {
15828                 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
15829                 if (flags & ANYOF_INVERT)
15830                     /*make sure the invert info is in each */
15831                     sv_catpvs(sv, "^");
15832             }
15833
15834             if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL) {
15835                 sv_catpvs(sv, "{non-utf8-latin1-all}");
15836             }
15837
15838             /* output information about the unicode matching */
15839             if (flags & ANYOF_ABOVE_LATIN1_ALL)
15840                 sv_catpvs(sv, "{unicode_all}");
15841             else if (ARG(o) != ANYOF_NONBITMAP_EMPTY) {
15842                 SV *lv; /* Set if there is something outside the bit map. */
15843                 bool byte_output = FALSE;   /* If something in the bitmap has
15844                                                been output */
15845                 SV *only_utf8_locale;
15846
15847                 /* Get the stuff that wasn't in the bitmap */
15848                 (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
15849                                                     &lv, &only_utf8_locale);
15850                 if (lv && lv != &PL_sv_undef) {
15851                     char *s = savesvpv(lv);
15852                     char * const origs = s;
15853
15854                     while (*s && *s != '\n')
15855                         s++;
15856
15857                     if (*s == '\n') {
15858                         const char * const t = ++s;
15859
15860                         if (flags & ANYOF_NONBITMAP_NON_UTF8) {
15861                             sv_catpvs(sv, "{outside bitmap}");
15862                         }
15863                         else {
15864                             sv_catpvs(sv, "{utf8}");
15865                         }
15866
15867                         if (byte_output) {
15868                             sv_catpvs(sv, " ");
15869                         }
15870
15871                         while (*s) {
15872                             if (*s == '\n') {
15873
15874                                 /* Truncate very long output */
15875                                 if (s - origs > 256) {
15876                                     Perl_sv_catpvf(aTHX_ sv,
15877                                                 "%.*s...",
15878                                                 (int) (s - origs - 1),
15879                                                 t);
15880                                     goto out_dump;
15881                                 }
15882                                 *s = ' ';
15883                             }
15884                             else if (*s == '\t') {
15885                                 *s = '-';
15886                             }
15887                             s++;
15888                         }
15889                         if (s[-1] == ' ')
15890                             s[-1] = 0;
15891
15892                         sv_catpv(sv, t);
15893                     }
15894
15895                 out_dump:
15896
15897                     Safefree(origs);
15898                     SvREFCNT_dec_NN(lv);
15899                 }
15900
15901                 if ((flags & ANYOF_LOC_FOLD)
15902                      && only_utf8_locale
15903                      && only_utf8_locale != &PL_sv_undef)
15904                 {
15905                     UV start, end;
15906                     int max_entries = 256;
15907
15908                     sv_catpvs(sv, "{utf8 locale}");
15909                     invlist_iterinit(only_utf8_locale);
15910                     while (invlist_iternext(only_utf8_locale,
15911                                             &start, &end)) {
15912                         put_range(sv, start, end);
15913                         max_entries --;
15914                         if (max_entries < 0) {
15915                             sv_catpvs(sv, "...");
15916                             break;
15917                         }
15918                     }
15919                     invlist_iterfinish(only_utf8_locale);
15920                 }
15921             }
15922         }
15923
15924         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
15925     }
15926     else if (k == POSIXD || k == NPOSIXD) {
15927         U8 index = FLAGS(o) * 2;
15928         if (index < C_ARRAY_LENGTH(anyofs)) {
15929             if (*anyofs[index] != '[')  {
15930                 sv_catpv(sv, "[");
15931             }
15932             sv_catpv(sv, anyofs[index]);
15933             if (*anyofs[index] != '[')  {
15934                 sv_catpv(sv, "]");
15935             }
15936         }
15937         else {
15938             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
15939         }
15940     }
15941     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
15942         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
15943 #else
15944     PERL_UNUSED_CONTEXT;
15945     PERL_UNUSED_ARG(sv);
15946     PERL_UNUSED_ARG(o);
15947     PERL_UNUSED_ARG(prog);
15948     PERL_UNUSED_ARG(reginfo);
15949 #endif  /* DEBUGGING */
15950 }
15951
15952
15953
15954 SV *
15955 Perl_re_intuit_string(pTHX_ REGEXP * const r)
15956 {                               /* Assume that RE_INTUIT is set */
15957     dVAR;
15958     struct regexp *const prog = ReANY(r);
15959     GET_RE_DEBUG_FLAGS_DECL;
15960
15961     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
15962     PERL_UNUSED_CONTEXT;
15963
15964     DEBUG_COMPILE_r(
15965         {
15966             const char * const s = SvPV_nolen_const(prog->check_substr
15967                       ? prog->check_substr : prog->check_utf8);
15968
15969             if (!PL_colorset) reginitcolors();
15970             PerlIO_printf(Perl_debug_log,
15971                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
15972                       PL_colors[4],
15973                       prog->check_substr ? "" : "utf8 ",
15974                       PL_colors[5],PL_colors[0],
15975                       s,
15976                       PL_colors[1],
15977                       (strlen(s) > 60 ? "..." : ""));
15978         } );
15979
15980     return prog->check_substr ? prog->check_substr : prog->check_utf8;
15981 }
15982
15983 /*
15984    pregfree()
15985
15986    handles refcounting and freeing the perl core regexp structure. When
15987    it is necessary to actually free the structure the first thing it
15988    does is call the 'free' method of the regexp_engine associated to
15989    the regexp, allowing the handling of the void *pprivate; member
15990    first. (This routine is not overridable by extensions, which is why
15991    the extensions free is called first.)
15992
15993    See regdupe and regdupe_internal if you change anything here.
15994 */
15995 #ifndef PERL_IN_XSUB_RE
15996 void
15997 Perl_pregfree(pTHX_ REGEXP *r)
15998 {
15999     SvREFCNT_dec(r);
16000 }
16001
16002 void
16003 Perl_pregfree2(pTHX_ REGEXP *rx)
16004 {
16005     dVAR;
16006     struct regexp *const r = ReANY(rx);
16007     GET_RE_DEBUG_FLAGS_DECL;
16008
16009     PERL_ARGS_ASSERT_PREGFREE2;
16010
16011     if (r->mother_re) {
16012         ReREFCNT_dec(r->mother_re);
16013     } else {
16014         CALLREGFREE_PVT(rx); /* free the private data */
16015         SvREFCNT_dec(RXp_PAREN_NAMES(r));
16016         Safefree(r->xpv_len_u.xpvlenu_pv);
16017     }
16018     if (r->substrs) {
16019         SvREFCNT_dec(r->anchored_substr);
16020         SvREFCNT_dec(r->anchored_utf8);
16021         SvREFCNT_dec(r->float_substr);
16022         SvREFCNT_dec(r->float_utf8);
16023         Safefree(r->substrs);
16024     }
16025     RX_MATCH_COPY_FREE(rx);
16026 #ifdef PERL_ANY_COW
16027     SvREFCNT_dec(r->saved_copy);
16028 #endif
16029     Safefree(r->offs);
16030     SvREFCNT_dec(r->qr_anoncv);
16031     rx->sv_u.svu_rx = 0;
16032 }
16033
16034 /*  reg_temp_copy()
16035
16036     This is a hacky workaround to the structural issue of match results
16037     being stored in the regexp structure which is in turn stored in
16038     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
16039     could be PL_curpm in multiple contexts, and could require multiple
16040     result sets being associated with the pattern simultaneously, such
16041     as when doing a recursive match with (??{$qr})
16042
16043     The solution is to make a lightweight copy of the regexp structure
16044     when a qr// is returned from the code executed by (??{$qr}) this
16045     lightweight copy doesn't actually own any of its data except for
16046     the starp/end and the actual regexp structure itself.
16047
16048 */
16049
16050
16051 REGEXP *
16052 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
16053 {
16054     struct regexp *ret;
16055     struct regexp *const r = ReANY(rx);
16056     const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
16057
16058     PERL_ARGS_ASSERT_REG_TEMP_COPY;
16059
16060     if (!ret_x)
16061         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
16062     else {
16063         SvOK_off((SV *)ret_x);
16064         if (islv) {
16065             /* For PVLVs, SvANY points to the xpvlv body while sv_u points
16066                to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
16067                made both spots point to the same regexp body.) */
16068             REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
16069             assert(!SvPVX(ret_x));
16070             ret_x->sv_u.svu_rx = temp->sv_any;
16071             temp->sv_any = NULL;
16072             SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
16073             SvREFCNT_dec_NN(temp);
16074             /* SvCUR still resides in the xpvlv struct, so the regexp copy-
16075                ing below will not set it. */
16076             SvCUR_set(ret_x, SvCUR(rx));
16077         }
16078     }
16079     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
16080        sv_force_normal(sv) is called.  */
16081     SvFAKE_on(ret_x);
16082     ret = ReANY(ret_x);
16083
16084     SvFLAGS(ret_x) |= SvUTF8(rx);
16085     /* We share the same string buffer as the original regexp, on which we
16086        hold a reference count, incremented when mother_re is set below.
16087        The string pointer is copied here, being part of the regexp struct.
16088      */
16089     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
16090            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
16091     if (r->offs) {
16092         const I32 npar = r->nparens+1;
16093         Newx(ret->offs, npar, regexp_paren_pair);
16094         Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16095     }
16096     if (r->substrs) {
16097         Newx(ret->substrs, 1, struct reg_substr_data);
16098         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16099
16100         SvREFCNT_inc_void(ret->anchored_substr);
16101         SvREFCNT_inc_void(ret->anchored_utf8);
16102         SvREFCNT_inc_void(ret->float_substr);
16103         SvREFCNT_inc_void(ret->float_utf8);
16104
16105         /* check_substr and check_utf8, if non-NULL, point to either their
16106            anchored or float namesakes, and don't hold a second reference.  */
16107     }
16108     RX_MATCH_COPIED_off(ret_x);
16109 #ifdef PERL_ANY_COW
16110     ret->saved_copy = NULL;
16111 #endif
16112     ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
16113     SvREFCNT_inc_void(ret->qr_anoncv);
16114
16115     return ret_x;
16116 }
16117 #endif
16118
16119 /* regfree_internal()
16120
16121    Free the private data in a regexp. This is overloadable by
16122    extensions. Perl takes care of the regexp structure in pregfree(),
16123    this covers the *pprivate pointer which technically perl doesn't
16124    know about, however of course we have to handle the
16125    regexp_internal structure when no extension is in use.
16126
16127    Note this is called before freeing anything in the regexp
16128    structure.
16129  */
16130
16131 void
16132 Perl_regfree_internal(pTHX_ REGEXP * const rx)
16133 {
16134     dVAR;
16135     struct regexp *const r = ReANY(rx);
16136     RXi_GET_DECL(r,ri);
16137     GET_RE_DEBUG_FLAGS_DECL;
16138
16139     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
16140
16141     DEBUG_COMPILE_r({
16142         if (!PL_colorset)
16143             reginitcolors();
16144         {
16145             SV *dsv= sv_newmortal();
16146             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
16147                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
16148             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
16149                 PL_colors[4],PL_colors[5],s);
16150         }
16151     });
16152 #ifdef RE_TRACK_PATTERN_OFFSETS
16153     if (ri->u.offsets)
16154         Safefree(ri->u.offsets);             /* 20010421 MJD */
16155 #endif
16156     if (ri->code_blocks) {
16157         int n;
16158         for (n = 0; n < ri->num_code_blocks; n++)
16159             SvREFCNT_dec(ri->code_blocks[n].src_regex);
16160         Safefree(ri->code_blocks);
16161     }
16162
16163     if (ri->data) {
16164         int n = ri->data->count;
16165
16166         while (--n >= 0) {
16167           /* If you add a ->what type here, update the comment in regcomp.h */
16168             switch (ri->data->what[n]) {
16169             case 'a':
16170             case 'r':
16171             case 's':
16172             case 'S':
16173             case 'u':
16174                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
16175                 break;
16176             case 'f':
16177                 Safefree(ri->data->data[n]);
16178                 break;
16179             case 'l':
16180             case 'L':
16181                 break;
16182             case 'T':
16183                 { /* Aho Corasick add-on structure for a trie node.
16184                      Used in stclass optimization only */
16185                     U32 refcount;
16186                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
16187                     OP_REFCNT_LOCK;
16188                     refcount = --aho->refcount;
16189                     OP_REFCNT_UNLOCK;
16190                     if ( !refcount ) {
16191                         PerlMemShared_free(aho->states);
16192                         PerlMemShared_free(aho->fail);
16193                          /* do this last!!!! */
16194                         PerlMemShared_free(ri->data->data[n]);
16195                         /* we should only ever get called once, so
16196                          * assert as much, and also guard the free
16197                          * which /might/ happen twice. At the least
16198                          * it will make code anlyzers happy and it
16199                          * doesn't cost much. - Yves */
16200                         assert(ri->regstclass);
16201                         if (ri->regstclass) {
16202                             PerlMemShared_free(ri->regstclass);
16203                             ri->regstclass = 0;
16204                         }
16205                     }
16206                 }
16207                 break;
16208             case 't':
16209                 {
16210                     /* trie structure. */
16211                     U32 refcount;
16212                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
16213                     OP_REFCNT_LOCK;
16214                     refcount = --trie->refcount;
16215                     OP_REFCNT_UNLOCK;
16216                     if ( !refcount ) {
16217                         PerlMemShared_free(trie->charmap);
16218                         PerlMemShared_free(trie->states);
16219                         PerlMemShared_free(trie->trans);
16220                         if (trie->bitmap)
16221                             PerlMemShared_free(trie->bitmap);
16222                         if (trie->jump)
16223                             PerlMemShared_free(trie->jump);
16224                         PerlMemShared_free(trie->wordinfo);
16225                         /* do this last!!!! */
16226                         PerlMemShared_free(ri->data->data[n]);
16227                     }
16228                 }
16229                 break;
16230             default:
16231                 Perl_croak(aTHX_ "panic: regfree data code '%c'",
16232                                                     ri->data->what[n]);
16233             }
16234         }
16235         Safefree(ri->data->what);
16236         Safefree(ri->data);
16237     }
16238
16239     Safefree(ri);
16240 }
16241
16242 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
16243 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
16244 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
16245
16246 /*
16247    re_dup - duplicate a regexp.
16248
16249    This routine is expected to clone a given regexp structure. It is only
16250    compiled under USE_ITHREADS.
16251
16252    After all of the core data stored in struct regexp is duplicated
16253    the regexp_engine.dupe method is used to copy any private data
16254    stored in the *pprivate pointer. This allows extensions to handle
16255    any duplication it needs to do.
16256
16257    See pregfree() and regfree_internal() if you change anything here.
16258 */
16259 #if defined(USE_ITHREADS)
16260 #ifndef PERL_IN_XSUB_RE
16261 void
16262 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
16263 {
16264     dVAR;
16265     I32 npar;
16266     const struct regexp *r = ReANY(sstr);
16267     struct regexp *ret = ReANY(dstr);
16268
16269     PERL_ARGS_ASSERT_RE_DUP_GUTS;
16270
16271     npar = r->nparens+1;
16272     Newx(ret->offs, npar, regexp_paren_pair);
16273     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16274
16275     if (ret->substrs) {
16276         /* Do it this way to avoid reading from *r after the StructCopy().
16277            That way, if any of the sv_dup_inc()s dislodge *r from the L1
16278            cache, it doesn't matter.  */
16279         const bool anchored = r->check_substr
16280             ? r->check_substr == r->anchored_substr
16281             : r->check_utf8 == r->anchored_utf8;
16282         Newx(ret->substrs, 1, struct reg_substr_data);
16283         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16284
16285         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
16286         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
16287         ret->float_substr = sv_dup_inc(ret->float_substr, param);
16288         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
16289
16290         /* check_substr and check_utf8, if non-NULL, point to either their
16291            anchored or float namesakes, and don't hold a second reference.  */
16292
16293         if (ret->check_substr) {
16294             if (anchored) {
16295                 assert(r->check_utf8 == r->anchored_utf8);
16296                 ret->check_substr = ret->anchored_substr;
16297                 ret->check_utf8 = ret->anchored_utf8;
16298             } else {
16299                 assert(r->check_substr == r->float_substr);
16300                 assert(r->check_utf8 == r->float_utf8);
16301                 ret->check_substr = ret->float_substr;
16302                 ret->check_utf8 = ret->float_utf8;
16303             }
16304         } else if (ret->check_utf8) {
16305             if (anchored) {
16306                 ret->check_utf8 = ret->anchored_utf8;
16307             } else {
16308                 ret->check_utf8 = ret->float_utf8;
16309             }
16310         }
16311     }
16312
16313     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
16314     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
16315
16316     if (ret->pprivate)
16317         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
16318
16319     if (RX_MATCH_COPIED(dstr))
16320         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
16321     else
16322         ret->subbeg = NULL;
16323 #ifdef PERL_ANY_COW
16324     ret->saved_copy = NULL;
16325 #endif
16326
16327     /* Whether mother_re be set or no, we need to copy the string.  We
16328        cannot refrain from copying it when the storage points directly to
16329        our mother regexp, because that's
16330                1: a buffer in a different thread
16331                2: something we no longer hold a reference on
16332                so we need to copy it locally.  */
16333     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
16334     ret->mother_re   = NULL;
16335 }
16336 #endif /* PERL_IN_XSUB_RE */
16337
16338 /*
16339    regdupe_internal()
16340
16341    This is the internal complement to regdupe() which is used to copy
16342    the structure pointed to by the *pprivate pointer in the regexp.
16343    This is the core version of the extension overridable cloning hook.
16344    The regexp structure being duplicated will be copied by perl prior
16345    to this and will be provided as the regexp *r argument, however
16346    with the /old/ structures pprivate pointer value. Thus this routine
16347    may override any copying normally done by perl.
16348
16349    It returns a pointer to the new regexp_internal structure.
16350 */
16351
16352 void *
16353 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
16354 {
16355     dVAR;
16356     struct regexp *const r = ReANY(rx);
16357     regexp_internal *reti;
16358     int len;
16359     RXi_GET_DECL(r,ri);
16360
16361     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
16362
16363     len = ProgLen(ri);
16364
16365     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
16366           char, regexp_internal);
16367     Copy(ri->program, reti->program, len+1, regnode);
16368
16369     reti->num_code_blocks = ri->num_code_blocks;
16370     if (ri->code_blocks) {
16371         int n;
16372         Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
16373                 struct reg_code_block);
16374         Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
16375                 struct reg_code_block);
16376         for (n = 0; n < ri->num_code_blocks; n++)
16377              reti->code_blocks[n].src_regex = (REGEXP*)
16378                     sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
16379     }
16380     else
16381         reti->code_blocks = NULL;
16382
16383     reti->regstclass = NULL;
16384
16385     if (ri->data) {
16386         struct reg_data *d;
16387         const int count = ri->data->count;
16388         int i;
16389
16390         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
16391                 char, struct reg_data);
16392         Newx(d->what, count, U8);
16393
16394         d->count = count;
16395         for (i = 0; i < count; i++) {
16396             d->what[i] = ri->data->what[i];
16397             switch (d->what[i]) {
16398                 /* see also regcomp.h and regfree_internal() */
16399             case 'a': /* actually an AV, but the dup function is identical.  */
16400             case 'r':
16401             case 's':
16402             case 'S':
16403             case 'u': /* actually an HV, but the dup function is identical.  */
16404                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
16405                 break;
16406             case 'f':
16407                 /* This is cheating. */
16408                 Newx(d->data[i], 1, regnode_ssc);
16409                 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
16410                 reti->regstclass = (regnode*)d->data[i];
16411                 break;
16412             case 'T':
16413                 /* Trie stclasses are readonly and can thus be shared
16414                  * without duplication. We free the stclass in pregfree
16415                  * when the corresponding reg_ac_data struct is freed.
16416                  */
16417                 reti->regstclass= ri->regstclass;
16418                 /* FALLTHROUGH */
16419             case 't':
16420                 OP_REFCNT_LOCK;
16421                 ((reg_trie_data*)ri->data->data[i])->refcount++;
16422                 OP_REFCNT_UNLOCK;
16423                 /* FALLTHROUGH */
16424             case 'l':
16425             case 'L':
16426                 d->data[i] = ri->data->data[i];
16427                 break;
16428             default:
16429                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
16430                                                            ri->data->what[i]);
16431             }
16432         }
16433
16434         reti->data = d;
16435     }
16436     else
16437         reti->data = NULL;
16438
16439     reti->name_list_idx = ri->name_list_idx;
16440
16441 #ifdef RE_TRACK_PATTERN_OFFSETS
16442     if (ri->u.offsets) {
16443         Newx(reti->u.offsets, 2*len+1, U32);
16444         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
16445     }
16446 #else
16447     SetProgLen(reti,len);
16448 #endif
16449
16450     return (void*)reti;
16451 }
16452
16453 #endif    /* USE_ITHREADS */
16454
16455 #ifndef PERL_IN_XSUB_RE
16456
16457 /*
16458  - regnext - dig the "next" pointer out of a node
16459  */
16460 regnode *
16461 Perl_regnext(pTHX_ regnode *p)
16462 {
16463     dVAR;
16464     I32 offset;
16465
16466     if (!p)
16467         return(NULL);
16468
16469     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
16470         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
16471                                                 (int)OP(p), (int)REGNODE_MAX);
16472     }
16473
16474     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
16475     if (offset == 0)
16476         return(NULL);
16477
16478     return(p+offset);
16479 }
16480 #endif
16481
16482 STATIC void
16483 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
16484 {
16485     va_list args;
16486     STRLEN l1 = strlen(pat1);
16487     STRLEN l2 = strlen(pat2);
16488     char buf[512];
16489     SV *msv;
16490     const char *message;
16491
16492     PERL_ARGS_ASSERT_RE_CROAK2;
16493
16494     if (l1 > 510)
16495         l1 = 510;
16496     if (l1 + l2 > 510)
16497         l2 = 510 - l1;
16498     Copy(pat1, buf, l1 , char);
16499     Copy(pat2, buf + l1, l2 , char);
16500     buf[l1 + l2] = '\n';
16501     buf[l1 + l2 + 1] = '\0';
16502     va_start(args, pat2);
16503     msv = vmess(buf, &args);
16504     va_end(args);
16505     message = SvPV_const(msv,l1);
16506     if (l1 > 512)
16507         l1 = 512;
16508     Copy(message, buf, l1 , char);
16509     /* l1-1 to avoid \n */
16510     Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
16511 }
16512
16513 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
16514
16515 #ifndef PERL_IN_XSUB_RE
16516 void
16517 Perl_save_re_context(pTHX)
16518 {
16519     dVAR;
16520
16521     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
16522     if (PL_curpm) {
16523         const REGEXP * const rx = PM_GETRE(PL_curpm);
16524         if (rx) {
16525             U32 i;
16526             for (i = 1; i <= RX_NPARENS(rx); i++) {
16527                 char digits[TYPE_CHARS(long)];
16528                 const STRLEN len = my_snprintf(digits, sizeof(digits),
16529                                                "%lu", (long)i);
16530                 GV *const *const gvp
16531                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
16532
16533                 if (gvp) {
16534                     GV * const gv = *gvp;
16535                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
16536                         save_scalar(gv);
16537                 }
16538             }
16539         }
16540     }
16541 }
16542 #endif
16543
16544 #ifdef DEBUGGING
16545
16546 STATIC void
16547 S_put_byte(pTHX_ SV *sv, int c)
16548 {
16549     PERL_ARGS_ASSERT_PUT_BYTE;
16550
16551     if (!isPRINT(c)) {
16552         switch (c) {
16553             case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
16554             case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
16555             case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
16556             case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
16557             case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
16558
16559             default:
16560                 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
16561                 break;
16562         }
16563     }
16564     else {
16565         const char string = c;
16566         if (c == '-' || c == ']' || c == '\\' || c == '^')
16567             sv_catpvs(sv, "\\");
16568         sv_catpvn(sv, &string, 1);
16569     }
16570 }
16571
16572 STATIC void
16573 S_put_range(pTHX_ SV *sv, UV start, UV end)
16574 {
16575
16576     /* Appends to 'sv' a displayable version of the range of code points from
16577      * 'start' to 'end' */
16578
16579     assert(start <= end);
16580
16581     PERL_ARGS_ASSERT_PUT_RANGE;
16582
16583     if (end - start < 3) {  /* Individual chars in short ranges */
16584         for (; start <= end; start++)
16585             put_byte(sv, start);
16586     }
16587     else if (   end > 255
16588              || ! isALPHANUMERIC(start)
16589              || ! isALPHANUMERIC(end)
16590              || isDIGIT(start) != isDIGIT(end)
16591              || isUPPER(start) != isUPPER(end)
16592              || isLOWER(start) != isLOWER(end)
16593
16594                 /* This final test should get optimized out except on EBCDIC
16595                  * platforms, where it causes ranges that cross discontinuities
16596                  * like i/j to be shown as hex instead of the misleading,
16597                  * e.g. H-K (since that range includes more than H, I, J, K).
16598                  * */
16599              || (end - start) != NATIVE_TO_ASCII(end) - NATIVE_TO_ASCII(start))
16600     {
16601         Perl_sv_catpvf(aTHX_ sv, "\\x{%02" UVXf "}-\\x{%02" UVXf "}",
16602                        start,
16603                        (end < 256) ? end : 255);
16604     }
16605     else { /* Here, the ends of the range are both digits, or both uppercase,
16606               or both lowercase; and there's no discontinuity in the range
16607               (which could happen on EBCDIC platforms) */
16608         put_byte(sv, start);
16609         sv_catpvs(sv, "-");
16610         put_byte(sv, end);
16611     }
16612 }
16613
16614 STATIC bool
16615 S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
16616 {
16617     /* Appends to 'sv' a displayable version of the innards of the bracketed
16618      * character class whose bitmap is 'bitmap';  Returns 'TRUE' if it actually
16619      * output anything */
16620
16621     int i;
16622     bool has_output_anything = FALSE;
16623
16624     PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
16625
16626     for (i = 0; i < 256; i++) {
16627         if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) {
16628
16629             /* The character at index i should be output.  Find the next
16630              * character that should NOT be output */
16631             int j;
16632             for (j = i + 1; j <= 256; j++) {
16633                 if (! BITMAP_TEST((U8 *) bitmap, j)) {
16634                     break;
16635                 }
16636             }
16637
16638             /* Everything between them is a single range that should be output
16639              * */
16640             put_range(sv, i, j - 1);
16641             has_output_anything = TRUE;
16642             i = j;
16643         }
16644     }
16645
16646     return has_output_anything;
16647 }
16648
16649 #define CLEAR_OPTSTART \
16650     if (optstart) STMT_START {                                               \
16651         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,                       \
16652                               " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
16653         optstart=NULL;                                                       \
16654     } STMT_END
16655
16656 #define DUMPUNTIL(b,e)                                                       \
16657                     CLEAR_OPTSTART;                                          \
16658                     node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
16659
16660 STATIC const regnode *
16661 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
16662             const regnode *last, const regnode *plast,
16663             SV* sv, I32 indent, U32 depth)
16664 {
16665     dVAR;
16666     U8 op = PSEUDO;     /* Arbitrary non-END op. */
16667     const regnode *next;
16668     const regnode *optstart= NULL;
16669
16670     RXi_GET_DECL(r,ri);
16671     GET_RE_DEBUG_FLAGS_DECL;
16672
16673     PERL_ARGS_ASSERT_DUMPUNTIL;
16674
16675 #ifdef DEBUG_DUMPUNTIL
16676     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
16677         last ? last-start : 0,plast ? plast-start : 0);
16678 #endif
16679
16680     if (plast && plast < last)
16681         last= plast;
16682
16683     while (PL_regkind[op] != END && (!last || node < last)) {
16684         assert(node);
16685         /* While that wasn't END last time... */
16686         NODE_ALIGN(node);
16687         op = OP(node);
16688         if (op == CLOSE || op == WHILEM)
16689             indent--;
16690         next = regnext((regnode *)node);
16691
16692         /* Where, what. */
16693         if (OP(node) == OPTIMIZED) {
16694             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
16695                 optstart = node;
16696             else
16697                 goto after_print;
16698         } else
16699             CLEAR_OPTSTART;
16700
16701         regprop(r, sv, node, NULL);
16702         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
16703                       (int)(2*indent + 1), "", SvPVX_const(sv));
16704
16705         if (OP(node) != OPTIMIZED) {
16706             if (next == NULL)           /* Next ptr. */
16707                 PerlIO_printf(Perl_debug_log, " (0)");
16708             else if (PL_regkind[(U8)op] == BRANCH
16709                      && PL_regkind[OP(next)] != BRANCH )
16710                 PerlIO_printf(Perl_debug_log, " (FAIL)");
16711             else
16712                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
16713             (void)PerlIO_putc(Perl_debug_log, '\n');
16714         }
16715
16716       after_print:
16717         if (PL_regkind[(U8)op] == BRANCHJ) {
16718             assert(next);
16719             {
16720                 const regnode *nnode = (OP(next) == LONGJMP
16721                                        ? regnext((regnode *)next)
16722                                        : next);
16723                 if (last && nnode > last)
16724                     nnode = last;
16725                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
16726             }
16727         }
16728         else if (PL_regkind[(U8)op] == BRANCH) {
16729             assert(next);
16730             DUMPUNTIL(NEXTOPER(node), next);
16731         }
16732         else if ( PL_regkind[(U8)op]  == TRIE ) {
16733             const regnode *this_trie = node;
16734             const char op = OP(node);
16735             const U32 n = ARG(node);
16736             const reg_ac_data * const ac = op>=AHOCORASICK ?
16737                (reg_ac_data *)ri->data->data[n] :
16738                NULL;
16739             const reg_trie_data * const trie =
16740                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
16741 #ifdef DEBUGGING
16742             AV *const trie_words
16743                            = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
16744 #endif
16745             const regnode *nextbranch= NULL;
16746             I32 word_idx;
16747             sv_setpvs(sv, "");
16748             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
16749                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
16750
16751                 PerlIO_printf(Perl_debug_log, "%*s%s ",
16752                    (int)(2*(indent+3)), "",
16753                     elem_ptr
16754                     ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
16755                                 SvCUR(*elem_ptr), 60,
16756                                 PL_colors[0], PL_colors[1],
16757                                 (SvUTF8(*elem_ptr)
16758                                  ? PERL_PV_ESCAPE_UNI
16759                                  : 0)
16760                                 | PERL_PV_PRETTY_ELLIPSES
16761                                 | PERL_PV_PRETTY_LTGT
16762                             )
16763                     : "???"
16764                 );
16765                 if (trie->jump) {
16766                     U16 dist= trie->jump[word_idx+1];
16767                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
16768                                (UV)((dist ? this_trie + dist : next) - start));
16769                     if (dist) {
16770                         if (!nextbranch)
16771                             nextbranch= this_trie + trie->jump[0];
16772                         DUMPUNTIL(this_trie + dist, nextbranch);
16773                     }
16774                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
16775                         nextbranch= regnext((regnode *)nextbranch);
16776                 } else {
16777                     PerlIO_printf(Perl_debug_log, "\n");
16778                 }
16779             }
16780             if (last && next > last)
16781                 node= last;
16782             else
16783                 node= next;
16784         }
16785         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
16786             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
16787                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
16788         }
16789         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
16790             assert(next);
16791             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
16792         }
16793         else if ( op == PLUS || op == STAR) {
16794             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
16795         }
16796         else if (PL_regkind[(U8)op] == ANYOF) {
16797             /* arglen 1 + class block */
16798             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL)
16799                           ? ANYOF_POSIXL_SKIP
16800                           : ANYOF_SKIP);
16801             node = NEXTOPER(node);
16802         }
16803         else if (PL_regkind[(U8)op] == EXACT) {
16804             /* Literal string, where present. */
16805             node += NODE_SZ_STR(node) - 1;
16806             node = NEXTOPER(node);
16807         }
16808         else {
16809             node = NEXTOPER(node);
16810             node += regarglen[(U8)op];
16811         }
16812         if (op == CURLYX || op == OPEN)
16813             indent++;
16814     }
16815     CLEAR_OPTSTART;
16816 #ifdef DEBUG_DUMPUNTIL
16817     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
16818 #endif
16819     return node;
16820 }
16821
16822 #endif  /* DEBUGGING */
16823
16824 /*
16825  * Local variables:
16826  * c-indentation-style: bsd
16827  * c-basic-offset: 4
16828  * indent-tabs-mode: nil
16829  * End:
16830  *
16831  * ex: set ts=8 sts=4 sw=4 et:
16832  */