regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 EXTERN_C const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_static.c"
  90 #include "charclass_invlists.h"
  91 #include "inline_invlist.c"
  92 #include "unicode_constants.h"
  93
  94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
  95  _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  96 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  97 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  98
  99 #ifndef STATIC
 100 #define STATIC  static
 101 #endif
 102
 103
 104 struct RExC_state_t {
 105     U32         flags;                  /* RXf_* are we folding, multilining? */
 106     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 107     char        *precomp;               /* uncompiled string. */
 108     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 109     regexp      *rx;                    /* perl core regexp structure */
 110     regexp_internal     *rxi;           /* internal data for regexp object
 111                                            pprivate field */
 112     char        *start;                 /* Start of input for compile */
 113     char        *end;                   /* End of input for compile */
 114     char        *parse;                 /* Input-scan pointer. */
 115     SSize_t     whilem_seen;            /* number of WHILEM in this expr */
 116     regnode     *emit_start;            /* Start of emitted-code area */
 117     regnode     *emit_bound;            /* First regnode outside of the
 118                                            allocated space */
 119     regnode     *emit;                  /* Code-emit pointer; if = &emit_dummy,
 120                                            implies compiling, so don't emit */
 121     regnode_ssc emit_dummy;             /* placeholder for emit to point to;
 122                                            large enough for the largest
 123                                            non-EXACTish node, so can use it as
 124                                            scratch in pass1 */
 125     I32         naughty;                /* How bad is this pattern? */
 126     I32         sawback;                /* Did we see \1, ...? */
 127     U32         seen;
 128     SSize_t     size;                   /* Code size. */
 129     I32                npar;            /* Capture buffer count, (OPEN) plus
 130                                            one. ("par" 0 is the whole
 131                                            pattern)*/
 132     I32         nestroot;               /* root parens we are in - used by
 133                                            accept */
 134     I32         extralen;
 135     I32         seen_zerolen;
 136     regnode     **open_parens;          /* pointers to open parens */
 137     regnode     **close_parens;         /* pointers to close parens */
 138     regnode     *opend;                 /* END node in program */
 139     I32         utf8;           /* whether the pattern is utf8 or not */
 140     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 141                                 /* XXX use this for future optimisation of case
 142                                  * where pattern must be upgraded to utf8. */
 143     I32         uni_semantics;  /* If a d charset modifier should use unicode
 144                                    rules, even if the pattern is not in
 145                                    utf8 */
 146     HV          *paren_names;           /* Paren names */
 147
 148     regnode     **recurse;              /* Recurse regops */
 149     I32         recurse_count;          /* Number of recurse regops */
 150     U8          *study_chunk_recursed;  /* bitmap of which parens we have moved
 151                                            through */
 152     U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
 153     I32         in_lookbehind;
 154     I32         contains_locale;
 155     I32         contains_i;
 156     I32         override_recoding;
 157     I32         in_multi_char_class;
 158     struct reg_code_block *code_blocks; /* positions of literal (?{})
 159                                             within pattern */
 160     int         num_code_blocks;        /* size of code_blocks[] */
 161     int         code_index;             /* next code_blocks[] slot */
 162     SSize_t     maxlen;                        /* mininum possible number of chars in string to match */
 163 #ifdef ADD_TO_REGEXEC
 164     char        *starttry;              /* -Dr: where regtry was called. */
 165 #define RExC_starttry   (pRExC_state->starttry)
 166 #endif
 167     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 168 #ifdef DEBUGGING
 169     const char  *lastparse;
 170     I32         lastnum;
 171     AV          *paren_name_list;       /* idx -> name */
 172 #define RExC_lastparse  (pRExC_state->lastparse)
 173 #define RExC_lastnum    (pRExC_state->lastnum)
 174 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 175 #endif
 176 };
 177
 178 #define RExC_flags      (pRExC_state->flags)
 179 #define RExC_pm_flags   (pRExC_state->pm_flags)
 180 #define RExC_precomp    (pRExC_state->precomp)
 181 #define RExC_rx_sv      (pRExC_state->rx_sv)
 182 #define RExC_rx         (pRExC_state->rx)
 183 #define RExC_rxi        (pRExC_state->rxi)
 184 #define RExC_start      (pRExC_state->start)
 185 #define RExC_end        (pRExC_state->end)
 186 #define RExC_parse      (pRExC_state->parse)
 187 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 188 #ifdef RE_TRACK_PATTERN_OFFSETS
 189 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the
 190                                                          others */
 191 #endif
 192 #define RExC_emit       (pRExC_state->emit)
 193 #define RExC_emit_dummy (pRExC_state->emit_dummy)
 194 #define RExC_emit_start (pRExC_state->emit_start)
 195 #define RExC_emit_bound (pRExC_state->emit_bound)
 196 #define RExC_naughty    (pRExC_state->naughty)
 197 #define RExC_sawback    (pRExC_state->sawback)
 198 #define RExC_seen       (pRExC_state->seen)
 199 #define RExC_size       (pRExC_state->size)
 200 #define RExC_maxlen        (pRExC_state->maxlen)
 201 #define RExC_npar       (pRExC_state->npar)
 202 #define RExC_nestroot   (pRExC_state->nestroot)
 203 #define RExC_extralen   (pRExC_state->extralen)
 204 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 205 #define RExC_utf8       (pRExC_state->utf8)
 206 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 207 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 208 #define RExC_open_parens        (pRExC_state->open_parens)
 209 #define RExC_close_parens       (pRExC_state->close_parens)
 210 #define RExC_opend      (pRExC_state->opend)
 211 #define RExC_paren_names        (pRExC_state->paren_names)
 212 #define RExC_recurse    (pRExC_state->recurse)
 213 #define RExC_recurse_count      (pRExC_state->recurse_count)
 214 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
 215 #define RExC_study_chunk_recursed_bytes  \
 216                                    (pRExC_state->study_chunk_recursed_bytes)
 217 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 218 #define RExC_contains_locale    (pRExC_state->contains_locale)
 219 #define RExC_contains_i (pRExC_state->contains_i)
 220 #define RExC_override_recoding (pRExC_state->override_recoding)
 221 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 222
 223
 224 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 225 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 226         ((*s) == '{' && regcurly(s, FALSE)))
 227
 228 /*
 229  * Flags to be passed up and down.
 230  */
 231 #define WORST           0       /* Worst case. */
 232 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 233
 234 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 235  * character.  (There needs to be a case: in the switch statement in regexec.c
 236  * for any node marked SIMPLE.)  Note that this is not the same thing as
 237  * REGNODE_SIMPLE */
 238 #define SIMPLE          0x02
 239 #define SPSTART         0x04    /* Starts with * or + */
 240 #define POSTPONED       0x08    /* (?1),(?&name), (??{...}) or similar */
 241 #define TRYAGAIN        0x10    /* Weeded out a declaration. */
 242 #define RESTART_UTF8    0x20    /* Restart, need to calcuate sizes as UTF-8 */
 243
 244 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 245
 246 /* whether trie related optimizations are enabled */
 247 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 248 #define TRIE_STUDY_OPT
 249 #define FULL_TRIE_STUDY
 250 #define TRIE_STCLASS
 251 #endif
 252
 253
 254
 255 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 256 #define PBITVAL(paren) (1 << ((paren) & 7))
 257 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 258 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 259 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 260
 261 #define REQUIRE_UTF8    STMT_START {                                       \
 262                                      if (!UTF) {                           \
 263                                          *flagp = RESTART_UTF8;            \
 264                                          return NULL;                      \
 265                                      }                                     \
 266                         } STMT_END
 267
 268 /* This converts the named class defined in regcomp.h to its equivalent class
 269  * number defined in handy.h. */
 270 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 271 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 272
 273 #define _invlist_union_complement_2nd(a, b, output) \
 274                         _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
 275 #define _invlist_intersection_complement_2nd(a, b, output) \
 276                  _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
 277
 278 /* About scan_data_t.
 279
 280   During optimisation we recurse through the regexp program performing
 281   various inplace (keyhole style) optimisations. In addition study_chunk
 282   and scan_commit populate this data structure with information about
 283   what strings MUST appear in the pattern. We look for the longest
 284   string that must appear at a fixed location, and we look for the
 285   longest string that may appear at a floating location. So for instance
 286   in the pattern:
 287
 288     /FOO[xX]A.*B[xX]BAR/
 289
 290   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 291   strings (because they follow a .* construct). study_chunk will identify
 292   both FOO and BAR as being the longest fixed and floating strings respectively.
 293
 294   The strings can be composites, for instance
 295
 296      /(f)(o)(o)/
 297
 298   will result in a composite fixed substring 'foo'.
 299
 300   For each string some basic information is maintained:
 301
 302   - offset or min_offset
 303     This is the position the string must appear at, or not before.
 304     It also implicitly (when combined with minlenp) tells us how many
 305     characters must match before the string we are searching for.
 306     Likewise when combined with minlenp and the length of the string it
 307     tells us how many characters must appear after the string we have
 308     found.
 309
 310   - max_offset
 311     Only used for floating strings. This is the rightmost point that
 312     the string can appear at. If set to SSize_t_MAX it indicates that the
 313     string can occur infinitely far to the right.
 314
 315   - minlenp
 316     A pointer to the minimum number of characters of the pattern that the
 317     string was found inside. This is important as in the case of positive
 318     lookahead or positive lookbehind we can have multiple patterns
 319     involved. Consider
 320
 321     /(?=FOO).*F/
 322
 323     The minimum length of the pattern overall is 3, the minimum length
 324     of the lookahead part is 3, but the minimum length of the part that
 325     will actually match is 1. So 'FOO's minimum length is 3, but the
 326     minimum length for the F is 1. This is important as the minimum length
 327     is used to determine offsets in front of and behind the string being
 328     looked for.  Since strings can be composites this is the length of the
 329     pattern at the time it was committed with a scan_commit. Note that
 330     the length is calculated by study_chunk, so that the minimum lengths
 331     are not known until the full pattern has been compiled, thus the
 332     pointer to the value.
 333
 334   - lookbehind
 335
 336     In the case of lookbehind the string being searched for can be
 337     offset past the start point of the final matching string.
 338     If this value was just blithely removed from the min_offset it would
 339     invalidate some of the calculations for how many chars must match
 340     before or after (as they are derived from min_offset and minlen and
 341     the length of the string being searched for).
 342     When the final pattern is compiled and the data is moved from the
 343     scan_data_t structure into the regexp structure the information
 344     about lookbehind is factored in, with the information that would
 345     have been lost precalculated in the end_shift field for the
 346     associated string.
 347
 348   The fields pos_min and pos_delta are used to store the minimum offset
 349   and the delta to the maximum offset at the current point in the pattern.
 350
 351 */
 352
 353 typedef struct scan_data_t {
 354     /*I32 len_min;      unused */
 355     /*I32 len_delta;    unused */
 356     SSize_t pos_min;
 357     SSize_t pos_delta;
 358     SV *last_found;
 359     SSize_t last_end;       /* min value, <0 unless valid. */
 360     SSize_t last_start_min;
 361     SSize_t last_start_max;
 362     SV **longest;           /* Either &l_fixed, or &l_float. */
 363     SV *longest_fixed;      /* longest fixed string found in pattern */
 364     SSize_t offset_fixed;   /* offset where it starts */
 365     SSize_t *minlen_fixed;  /* pointer to the minlen relevant to the string */
 366     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 367     SV *longest_float;      /* longest floating string found in pattern */
 368     SSize_t offset_float_min; /* earliest point in string it can appear */
 369     SSize_t offset_float_max; /* latest point in string it can appear */
 370     SSize_t *minlen_float;  /* pointer to the minlen relevant to the string */
 371     SSize_t lookbehind_float; /* is the pos of the string modified by LB */
 372     I32 flags;
 373     I32 whilem_c;
 374     SSize_t *last_closep;
 375     regnode_ssc *start_class;
 376 } scan_data_t;
 377
 378 /* The below is perhaps overboard, but this allows us to save a test at the
 379  * expense of a mask.  This is because on both EBCDIC and ASCII machines, 'A'
 380  * and 'a' differ by a single bit; the same with the upper and lower case of
 381  * all other ASCII-range alphabetics.  On ASCII platforms, they are 32 apart;
 382  * on EBCDIC, they are 64.  This uses an exclusive 'or' to find that bit and
 383  * then inverts it to form a mask, with just a single 0, in the bit position
 384  * where the upper- and lowercase differ.  XXX There are about 40 other
 385  * instances in the Perl core where this micro-optimization could be used.
 386  * Should decide if maintenance cost is worse, before changing those
 387  *
 388  * Returns a boolean as to whether or not 'v' is either a lowercase or
 389  * uppercase instance of 'c', where 'c' is in [A-Za-z].  If 'c' is a
 390  * compile-time constant, the generated code is better than some optimizing
 391  * compilers figure out, amounting to a mask and test.  The results are
 392  * meaningless if 'c' is not one of [A-Za-z] */
 393 #define isARG2_lower_or_UPPER_ARG1(c, v) \
 394                               (((v) & ~('A' ^ 'a')) ==  ((c) & ~('A' ^ 'a')))
 395
 396 /*
 397  * Forward declarations for pregcomp()'s friends.
 398  */
 399
 400 static const scan_data_t zero_scan_data =
 401   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 402
 403 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 404 #define SF_BEFORE_SEOL          0x0001
 405 #define SF_BEFORE_MEOL          0x0002
 406 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 407 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 408
 409 #define SF_FIX_SHIFT_EOL        (+2)
 410 #define SF_FL_SHIFT_EOL         (+4)
 411
 412 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 413 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 414
 415 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 416 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 417 #define SF_IS_INF               0x0040
 418 #define SF_HAS_PAR              0x0080
 419 #define SF_IN_PAR               0x0100
 420 #define SF_HAS_EVAL             0x0200
 421 #define SCF_DO_SUBSTR           0x0400
 422 #define SCF_DO_STCLASS_AND      0x0800
 423 #define SCF_DO_STCLASS_OR       0x1000
 424 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 425 #define SCF_WHILEM_VISITED_POS  0x2000
 426
 427 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 428 #define SCF_SEEN_ACCEPT         0x8000
 429 #define SCF_TRIE_DOING_RESTUDY 0x10000
 430
 431 #define UTF cBOOL(RExC_utf8)
 432
 433 /* The enums for all these are ordered so things work out correctly */
 434 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 435 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags)                    \
 436                                                      == REGEX_DEPENDS_CHARSET)
 437 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 438 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags)                \
 439                                                      >= REGEX_UNICODE_CHARSET)
 440 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags)                      \
 441                                             == REGEX_ASCII_RESTRICTED_CHARSET)
 442 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags)             \
 443                                             >= REGEX_ASCII_RESTRICTED_CHARSET)
 444 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags)                 \
 445                                         == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 446
 447 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 448
 449 /* For programs that want to be strictly Unicode compatible by dying if any
 450  * attempt is made to match a non-Unicode code point against a Unicode
 451  * property.  */
 452 #define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))
 453
 454 #define OOB_NAMEDCLASS          -1
 455
 456 /* There is no code point that is out-of-bounds, so this is problematic.  But
 457  * its only current use is to initialize a variable that is always set before
 458  * looked at. */
 459 #define OOB_UNICODE             0xDEADBEEF
 460
 461 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 462 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 463
 464
 465 /* length of regex to show in messages that don't mark a position within */
 466 #define RegexLengthToShowInErrorMessages 127
 467
 468 /*
 469  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 470  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 471  * op/pragma/warn/regcomp.
 472  */
 473 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 474 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 475
 476 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
 477                         " in m/%"UTF8f MARKER2 "%"UTF8f"/"
 478
 479 #define REPORT_LOCATION_ARGS(offset)            \
 480                 UTF8fARG(UTF, offset, RExC_precomp), \
 481                 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
 482
 483 /*
 484  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 485  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 486  * "...".
 487  */
 488 #define _FAIL(code) STMT_START {                                        \
 489     const char *ellipses = "";                                          \
 490     IV len = RExC_end - RExC_precomp;                                   \
 491                                                                         \
 492     if (!SIZE_ONLY)                                                     \
 493         SAVEFREESV(RExC_rx_sv);                                         \
 494     if (len > RegexLengthToShowInErrorMessages) {                       \
 495         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 496         len = RegexLengthToShowInErrorMessages - 10;                    \
 497         ellipses = "...";                                               \
 498     }                                                                   \
 499     code;                                                               \
 500 } STMT_END
 501
 502 #define FAIL(msg) _FAIL(                            \
 503     Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/",           \
 504             msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 505
 506 #define FAIL2(msg,arg) _FAIL(                       \
 507     Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/",         \
 508             arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 509
 510 /*
 511  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 512  */
 513 #define Simple_vFAIL(m) STMT_START {                                    \
 514     const IV offset = RExC_parse - RExC_precomp;                        \
 515     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 516             m, REPORT_LOCATION_ARGS(offset));   \
 517 } STMT_END
 518
 519 /*
 520  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 521  */
 522 #define vFAIL(m) STMT_START {                           \
 523     if (!SIZE_ONLY)                                     \
 524         SAVEFREESV(RExC_rx_sv);                         \
 525     Simple_vFAIL(m);                                    \
 526 } STMT_END
 527
 528 /*
 529  * Like Simple_vFAIL(), but accepts two arguments.
 530  */
 531 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 532     const IV offset = RExC_parse - RExC_precomp;                        \
 533     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,                      \
 534                       REPORT_LOCATION_ARGS(offset));    \
 535 } STMT_END
 536
 537 /*
 538  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 539  */
 540 #define vFAIL2(m,a1) STMT_START {                       \
 541     if (!SIZE_ONLY)                                     \
 542         SAVEFREESV(RExC_rx_sv);                         \
 543     Simple_vFAIL2(m, a1);                               \
 544 } STMT_END
 545
 546
 547 /*
 548  * Like Simple_vFAIL(), but accepts three arguments.
 549  */
 550 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 551     const IV offset = RExC_parse - RExC_precomp;                \
 552     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,          \
 553             REPORT_LOCATION_ARGS(offset));      \
 554 } STMT_END
 555
 556 /*
 557  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 558  */
 559 #define vFAIL3(m,a1,a2) STMT_START {                    \
 560     if (!SIZE_ONLY)                                     \
 561         SAVEFREESV(RExC_rx_sv);                         \
 562     Simple_vFAIL3(m, a1, a2);                           \
 563 } STMT_END
 564
 565 /*
 566  * Like Simple_vFAIL(), but accepts four arguments.
 567  */
 568 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 569     const IV offset = RExC_parse - RExC_precomp;                \
 570     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3,              \
 571             REPORT_LOCATION_ARGS(offset));      \
 572 } STMT_END
 573
 574 #define vFAIL4(m,a1,a2,a3) STMT_START {                 \
 575     if (!SIZE_ONLY)                                     \
 576         SAVEFREESV(RExC_rx_sv);                         \
 577     Simple_vFAIL4(m, a1, a2, a3);                       \
 578 } STMT_END
 579
 580 /* A specialized version of vFAIL2 that works with UTF8f */
 581 #define vFAIL2utf8f(m, a1) STMT_START { \
 582     const IV offset = RExC_parse - RExC_precomp;   \
 583     if (!SIZE_ONLY)                                \
 584         SAVEFREESV(RExC_rx_sv);                    \
 585     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
 586             REPORT_LOCATION_ARGS(offset));         \
 587 } STMT_END
 588
 589
 590 /* m is not necessarily a "literal string", in this macro */
 591 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
 592     const IV offset = loc - RExC_precomp;                               \
 593     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
 594             m, REPORT_LOCATION_ARGS(offset));       \
 595 } STMT_END
 596
 597 #define ckWARNreg(loc,m) STMT_START {                                   \
 598     const IV offset = loc - RExC_precomp;                               \
 599     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 600             REPORT_LOCATION_ARGS(offset));              \
 601 } STMT_END
 602
 603 #define vWARN_dep(loc, m) STMT_START {                                  \
 604     const IV offset = loc - RExC_precomp;                               \
 605     Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION,     \
 606             REPORT_LOCATION_ARGS(offset));              \
 607 } STMT_END
 608
 609 #define ckWARNdep(loc,m) STMT_START {                                   \
 610     const IV offset = loc - RExC_precomp;                               \
 611     Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),                   \
 612             m REPORT_LOCATION,                                          \
 613             REPORT_LOCATION_ARGS(offset));              \
 614 } STMT_END
 615
 616 #define ckWARNregdep(loc,m) STMT_START {                                \
 617     const IV offset = loc - RExC_precomp;                               \
 618     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 619             m REPORT_LOCATION,                                          \
 620             REPORT_LOCATION_ARGS(offset));              \
 621 } STMT_END
 622
 623 #define ckWARN2reg_d(loc,m, a1) STMT_START {                            \
 624     const IV offset = loc - RExC_precomp;                               \
 625     Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),                       \
 626             m REPORT_LOCATION,                                          \
 627             a1, REPORT_LOCATION_ARGS(offset));  \
 628 } STMT_END
 629
 630 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 631     const IV offset = loc - RExC_precomp;                               \
 632     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 633             a1, REPORT_LOCATION_ARGS(offset));  \
 634 } STMT_END
 635
 636 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 637     const IV offset = loc - RExC_precomp;                               \
 638     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 639             a1, a2, REPORT_LOCATION_ARGS(offset));      \
 640 } STMT_END
 641
 642 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 643     const IV offset = loc - RExC_precomp;                               \
 644     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 645             a1, a2, REPORT_LOCATION_ARGS(offset));      \
 646 } STMT_END
 647
 648 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 649     const IV offset = loc - RExC_precomp;                               \
 650     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 651             a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 652 } STMT_END
 653
 654 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 655     const IV offset = loc - RExC_precomp;                               \
 656     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 657             a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 658 } STMT_END
 659
 660 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 661     const IV offset = loc - RExC_precomp;                               \
 662     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 663             a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
 664 } STMT_END
 665
 666
 667 /* Allow for side effects in s */
 668 #define REGC(c,s) STMT_START {                  \
 669     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 670 } STMT_END
 671
 672 /* Macros for recording node offsets.   20001227 mjd@plover.com
 673  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 674  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 675  * Element 0 holds the number n.
 676  * Position is 1 indexed.
 677  */
 678 #ifndef RE_TRACK_PATTERN_OFFSETS
 679 #define Set_Node_Offset_To_R(node,byte)
 680 #define Set_Node_Offset(node,byte)
 681 #define Set_Cur_Node_Offset
 682 #define Set_Node_Length_To_R(node,len)
 683 #define Set_Node_Length(node,len)
 684 #define Set_Node_Cur_Length(node,start)
 685 #define Node_Offset(n)
 686 #define Node_Length(n)
 687 #define Set_Node_Offset_Length(node,offset,len)
 688 #define ProgLen(ri) ri->u.proglen
 689 #define SetProgLen(ri,x) ri->u.proglen = x
 690 #else
 691 #define ProgLen(ri) ri->u.offsets[0]
 692 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 693 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 694     if (! SIZE_ONLY) {                                                  \
 695         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 696                     __LINE__, (int)(node), (int)(byte)));               \
 697         if((node) < 0) {                                                \
 698             Perl_croak(aTHX_ "value of node is %d in Offset macro",     \
 699                                          (int)(node));                  \
 700         } else {                                                        \
 701             RExC_offsets[2*(node)-1] = (byte);                          \
 702         }                                                               \
 703     }                                                                   \
 704 } STMT_END
 705
 706 #define Set_Node_Offset(node,byte) \
 707     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 708 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 709
 710 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 711     if (! SIZE_ONLY) {                                                  \
 712         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 713                 __LINE__, (int)(node), (int)(len)));                    \
 714         if((node) < 0) {                                                \
 715             Perl_croak(aTHX_ "value of node is %d in Length macro",     \
 716                                          (int)(node));                  \
 717         } else {                                                        \
 718             RExC_offsets[2*(node)] = (len);                             \
 719         }                                                               \
 720     }                                                                   \
 721 } STMT_END
 722
 723 #define Set_Node_Length(node,len) \
 724     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 725 #define Set_Node_Cur_Length(node, start)                \
 726     Set_Node_Length(node, RExC_parse - start)
 727
 728 /* Get offsets and lengths */
 729 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 730 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 731
 732 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 733     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 734     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 735 } STMT_END
 736 #endif
 737
 738 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 739 #define EXPERIMENTAL_INPLACESCAN
 740 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 741
 742 #define DEBUG_RExC_seen() \
 743         DEBUG_OPTIMISE_MORE_r({                                             \
 744             PerlIO_printf(Perl_debug_log,"RExC_seen: ");                    \
 745                                                                             \
 746             if (RExC_seen & REG_ZERO_LEN_SEEN)                              \
 747                 PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN ");         \
 748                                                                             \
 749             if (RExC_seen & REG_LOOKBEHIND_SEEN)                            \
 750                 PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN ");       \
 751                                                                             \
 752             if (RExC_seen & REG_GPOS_SEEN)                                  \
 753                 PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN ");             \
 754                                                                             \
 755             if (RExC_seen & REG_CANY_SEEN)                                  \
 756                 PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN ");             \
 757                                                                             \
 758             if (RExC_seen & REG_RECURSE_SEEN)                               \
 759                 PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN ");          \
 760                                                                             \
 761             if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)                         \
 762                 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN ");    \
 763                                                                             \
 764             if (RExC_seen & REG_VERBARG_SEEN)                               \
 765                 PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN ");          \
 766                                                                             \
 767             if (RExC_seen & REG_CUTGROUP_SEEN)                              \
 768                 PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN ");         \
 769                                                                             \
 770             if (RExC_seen & REG_RUN_ON_COMMENT_SEEN)                        \
 771                 PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN ");   \
 772                                                                             \
 773             if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)                        \
 774                 PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN ");   \
 775                                                                             \
 776             if (RExC_seen & REG_GOSTART_SEEN)                               \
 777                 PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN ");          \
 778                                                                             \
 779             if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                               \
 780                 PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN ");          \
 781                                                                             \
 782             PerlIO_printf(Perl_debug_log,"\n");                             \
 783         });
 784
 785 #define DEBUG_STUDYDATA(str,data,depth)                              \
 786 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 787     PerlIO_printf(Perl_debug_log,                                    \
 788         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 789         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 790         (int)(depth)*2, "",                                          \
 791         (IV)((data)->pos_min),                                       \
 792         (IV)((data)->pos_delta),                                     \
 793         (UV)((data)->flags),                                         \
 794         (IV)((data)->whilem_c),                                      \
 795         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 796         is_inf ? "INF " : ""                                         \
 797     );                                                               \
 798     if ((data)->last_found)                                          \
 799         PerlIO_printf(Perl_debug_log,                                \
 800             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 801             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 802             SvPVX_const((data)->last_found),                         \
 803             (IV)((data)->last_end),                                  \
 804             (IV)((data)->last_start_min),                            \
 805             (IV)((data)->last_start_max),                            \
 806             ((data)->longest &&                                      \
 807              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 808             SvPVX_const((data)->longest_fixed),                      \
 809             (IV)((data)->offset_fixed),                              \
 810             ((data)->longest &&                                      \
 811              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 812             SvPVX_const((data)->longest_float),                      \
 813             (IV)((data)->offset_float_min),                          \
 814             (IV)((data)->offset_float_max)                           \
 815         );                                                           \
 816     PerlIO_printf(Perl_debug_log,"\n");                              \
 817 });
 818
 819 /* Mark that we cannot extend a found fixed substring at this point.
 820    Update the longest found anchored substring and the longest found
 821    floating substrings if needed. */
 822
 823 STATIC void
 824 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
 825                     SSize_t *minlenp, int is_inf)
 826 {
 827     const STRLEN l = CHR_SVLEN(data->last_found);
 828     const STRLEN old_l = CHR_SVLEN(*data->longest);
 829     GET_RE_DEBUG_FLAGS_DECL;
 830
 831     PERL_ARGS_ASSERT_SCAN_COMMIT;
 832
 833     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 834         SvSetMagicSV(*data->longest, data->last_found);
 835         if (*data->longest == data->longest_fixed) {
 836             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 837             if (data->flags & SF_BEFORE_EOL)
 838                 data->flags
 839                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 840             else
 841                 data->flags &= ~SF_FIX_BEFORE_EOL;
 842             data->minlen_fixed=minlenp;
 843             data->lookbehind_fixed=0;
 844         }
 845         else { /* *data->longest == data->longest_float */
 846             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 847             data->offset_float_max = (l
 848                                       ? data->last_start_max
 849                                       : (data->pos_delta == SSize_t_MAX
 850                                          ? SSize_t_MAX
 851                                          : data->pos_min + data->pos_delta));
 852             if (is_inf
 853                  || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
 854                 data->offset_float_max = SSize_t_MAX;
 855             if (data->flags & SF_BEFORE_EOL)
 856                 data->flags
 857                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 858             else
 859                 data->flags &= ~SF_FL_BEFORE_EOL;
 860             data->minlen_float=minlenp;
 861             data->lookbehind_float=0;
 862         }
 863     }
 864     SvCUR_set(data->last_found, 0);
 865     {
 866         SV * const sv = data->last_found;
 867         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 868             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 869             if (mg)
 870                 mg->mg_len = 0;
 871         }
 872     }
 873     data->last_end = -1;
 874     data->flags &= ~SF_BEFORE_EOL;
 875     DEBUG_STUDYDATA("commit: ",data,0);
 876 }
 877
 878 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
 879  * list that describes which code points it matches */
 880
 881 STATIC void
 882 S_ssc_anything(pTHX_ regnode_ssc *ssc)
 883 {
 884     /* Set the SSC 'ssc' to match an empty string or any code point */
 885
 886     PERL_ARGS_ASSERT_SSC_ANYTHING;
 887
 888     assert(is_ANYOF_SYNTHETIC(ssc));
 889
 890     ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
 891     _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
 892     ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING;    /* Plus match empty string */
 893 }
 894
 895 STATIC int
 896 S_ssc_is_anything(pTHX_ const regnode_ssc *ssc)
 897 {
 898     /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
 899      * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
 900      * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
 901      * in any way, so there's no point in using it */
 902
 903     UV start, end;
 904     bool ret;
 905
 906     PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
 907
 908     assert(is_ANYOF_SYNTHETIC(ssc));
 909
 910     if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) {
 911         return FALSE;
 912     }
 913
 914     /* See if the list consists solely of the range 0 - Infinity */
 915     invlist_iterinit(ssc->invlist);
 916     ret = invlist_iternext(ssc->invlist, &start, &end)
 917           && start == 0
 918           && end == UV_MAX;
 919
 920     invlist_iterfinish(ssc->invlist);
 921
 922     if (ret) {
 923         return TRUE;
 924     }
 925
 926     /* If e.g., both \w and \W are set, matches everything */
 927     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
 928         int i;
 929         for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
 930             if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
 931                 return TRUE;
 932             }
 933         }
 934     }
 935
 936     return FALSE;
 937 }
 938
 939 STATIC void
 940 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
 941 {
 942     /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
 943      * string, any code point, or any posix class under locale */
 944
 945     PERL_ARGS_ASSERT_SSC_INIT;
 946
 947     Zero(ssc, 1, regnode_ssc);
 948     set_ANYOF_SYNTHETIC(ssc);
 949     ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY);
 950     ssc_anything(ssc);
 951
 952     /* If any portion of the regex is to operate under locale rules,
 953      * initialization includes it.  The reason this isn't done for all regexes
 954      * is that the optimizer was written under the assumption that locale was
 955      * all-or-nothing.  Given the complexity and lack of documentation in the
 956      * optimizer, and that there are inadequate test cases for locale, many
 957      * parts of it may not work properly, it is safest to avoid locale unless
 958      * necessary. */
 959     if (RExC_contains_locale) {
 960         ANYOF_POSIXL_SETALL(ssc);
 961     }
 962     else {
 963         ANYOF_POSIXL_ZERO(ssc);
 964     }
 965 }
 966
 967 STATIC int
 968 S_ssc_is_cp_posixl_init(pTHX_ const RExC_state_t *pRExC_state,
 969                               const regnode_ssc *ssc)
 970 {
 971     /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
 972      * to the list of code points matched, and locale posix classes; hence does
 973      * not check its flags) */
 974
 975     UV start, end;
 976     bool ret;
 977
 978     PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
 979
 980     assert(is_ANYOF_SYNTHETIC(ssc));
 981
 982     invlist_iterinit(ssc->invlist);
 983     ret = invlist_iternext(ssc->invlist, &start, &end)
 984           && start == 0
 985           && end == UV_MAX;
 986
 987     invlist_iterfinish(ssc->invlist);
 988
 989     if (! ret) {
 990         return FALSE;
 991     }
 992
 993     if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
 994         return FALSE;
 995     }
 996
 997     return TRUE;
 998 }
 999
1000 STATIC SV*
1001 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1002                                const regnode_charclass* const node)
1003 {
1004     /* Returns a mortal inversion list defining which code points are matched
1005      * by 'node', which is of type ANYOF.  Handles complementing the result if
1006      * appropriate.  If some code points aren't knowable at this time, the
1007      * returned list must, and will, contain every code point that is a
1008      * possibility. */
1009
1010     SV* invlist = sv_2mortal(_new_invlist(0));
1011     SV* only_utf8_locale_invlist = NULL;
1012     unsigned int i;
1013     const U32 n = ARG(node);
1014     bool new_node_has_latin1 = FALSE;
1015
1016     PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1017
1018     /* Look at the data structure created by S_set_ANYOF_arg() */
1019     if (n != ANYOF_NONBITMAP_EMPTY) {
1020         SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1021         AV * const av = MUTABLE_AV(SvRV(rv));
1022         SV **const ary = AvARRAY(av);
1023         assert(RExC_rxi->data->what[n] == 's');
1024
1025         if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1026             invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1027         }
1028         else if (ary[0] && ary[0] != &PL_sv_undef) {
1029
1030             /* Here, no compile-time swash, and there are things that won't be
1031              * known until runtime -- we have to assume it could be anything */
1032             return _add_range_to_invlist(invlist, 0, UV_MAX);
1033         }
1034         else if (ary[3] && ary[3] != &PL_sv_undef) {
1035
1036             /* Here no compile-time swash, and no run-time only data.  Use the
1037              * node's inversion list */
1038             invlist = sv_2mortal(invlist_clone(ary[3]));
1039         }
1040
1041         /* Get the code points valid only under UTF-8 locales */
1042         if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
1043             && ary[2] && ary[2] != &PL_sv_undef)
1044         {
1045             only_utf8_locale_invlist = ary[2];
1046         }
1047     }
1048
1049     /* An ANYOF node contains a bitmap for the first 256 code points, and an
1050      * inversion list for the others, but if there are code points that should
1051      * match only conditionally on the target string being UTF-8, those are
1052      * placed in the inversion list, and not the bitmap.  Since there are
1053      * circumstances under which they could match, they are included in the
1054      * SSC.  But if the ANYOF node is to be inverted, we have to exclude them
1055      * here, so that when we invert below, the end result actually does include
1056      * them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We have to do this here
1057      * before we add the unconditionally matched code points */
1058     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1059         _invlist_intersection_complement_2nd(invlist,
1060                                              PL_UpperLatin1,
1061                                              &invlist);
1062     }
1063
1064     /* Add in the points from the bit map */
1065     for (i = 0; i < 256; i++) {
1066         if (ANYOF_BITMAP_TEST(node, i)) {
1067             invlist = add_cp_to_invlist(invlist, i);
1068             new_node_has_latin1 = TRUE;
1069         }
1070     }
1071
1072     /* If this can match all upper Latin1 code points, have to add them
1073      * as well */
1074     if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_NON_ASCII_ALL) {
1075         _invlist_union(invlist, PL_UpperLatin1, &invlist);
1076     }
1077
1078     /* Similarly for these */
1079     if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
1080         invlist = _add_range_to_invlist(invlist, 256, UV_MAX);
1081     }
1082
1083     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1084         _invlist_invert(invlist);
1085     }
1086     else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
1087
1088         /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1089          * locale.  We can skip this if there are no 0-255 at all. */
1090         _invlist_union(invlist, PL_Latin1, &invlist);
1091     }
1092
1093     /* Similarly add the UTF-8 locale possible matches.  These have to be
1094      * deferred until after the non-UTF-8 locale ones are taken care of just
1095      * above, or it leads to wrong results under ANYOF_INVERT */
1096     if (only_utf8_locale_invlist) {
1097         _invlist_union_maybe_complement_2nd(invlist,
1098                                             only_utf8_locale_invlist,
1099                                             ANYOF_FLAGS(node) & ANYOF_INVERT,
1100                                             &invlist);
1101     }
1102
1103     return invlist;
1104 }
1105
1106 /* These two functions currently do the exact same thing */
1107 #define ssc_init_zero           ssc_init
1108
1109 #define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
1110 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1111
1112 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
1113  * should not be inverted.  'and_with->flags & ANYOF_POSIXL' should be 0 if
1114  * 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1115
1116 STATIC void
1117 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1118                 const regnode_charclass *and_with)
1119 {
1120     /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1121      * another SSC or a regular ANYOF class.  Can create false positives. */
1122
1123     SV* anded_cp_list;
1124     U8  anded_flags;
1125
1126     PERL_ARGS_ASSERT_SSC_AND;
1127
1128     assert(is_ANYOF_SYNTHETIC(ssc));
1129
1130     /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1131      * the code point inversion list and just the relevant flags */
1132     if (is_ANYOF_SYNTHETIC(and_with)) {
1133         anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1134         anded_flags = ANYOF_FLAGS(and_with);
1135
1136         /* XXX This is a kludge around what appears to be deficiencies in the
1137          * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
1138          * there are paths through the optimizer where it doesn't get weeded
1139          * out when it should.  And if we don't make some extra provision for
1140          * it like the code just below, it doesn't get added when it should.
1141          * This solution is to add it only when AND'ing, which is here, and
1142          * only when what is being AND'ed is the pristine, original node
1143          * matching anything.  Thus it is like adding it to ssc_anything() but
1144          * only when the result is to be AND'ed.  Probably the same solution
1145          * could be adopted for the same problem we have with /l matching,
1146          * which is solved differently in S_ssc_init(), and that would lead to
1147          * fewer false positives than that solution has.  But if this solution
1148          * creates bugs, the consequences are only that a warning isn't raised
1149          * that should be; while the consequences for having /l bugs is
1150          * incorrect matches */
1151         if (ssc_is_anything((regnode_ssc *)and_with)) {
1152             anded_flags |= ANYOF_WARN_SUPER;
1153         }
1154     }
1155     else {
1156         anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1157         anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1158     }
1159
1160     ANYOF_FLAGS(ssc) &= anded_flags;
1161
1162     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1163      * C2 is the list of code points in 'and-with'; P2, its posix classes.
1164      * 'and_with' may be inverted.  When not inverted, we have the situation of
1165      * computing:
1166      *  (C1 | P1) & (C2 | P2)
1167      *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1168      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1169      *                    <=  ((C1 & C2) |       P2)) | ( P1       | (P1 & P2))
1170      *                    <=  ((C1 & C2) | P1 | P2)
1171      * Alternatively, the last few steps could be:
1172      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1173      *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
1174      *                    <=  (C1 | C2 | (P1 & P2))
1175      * We favor the second approach if either P1 or P2 is non-empty.  This is
1176      * because these components are a barrier to doing optimizations, as what
1177      * they match cannot be known until the moment of matching as they are
1178      * dependent on the current locale, 'AND"ing them likely will reduce or
1179      * eliminate them.
1180      * But we can do better if we know that C1,P1 are in their initial state (a
1181      * frequent occurrence), each matching everything:
1182      *  (<everything>) & (C2 | P2) =  C2 | P2
1183      * Similarly, if C2,P2 are in their initial state (again a frequent
1184      * occurrence), the result is a no-op
1185      *  (C1 | P1) & (<everything>) =  C1 | P1
1186      *
1187      * Inverted, we have
1188      *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
1189      *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1190      *                         <=  (C1 & ~C2) | (P1 & ~P2)
1191      * */
1192
1193     if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1194         && ! is_ANYOF_SYNTHETIC(and_with))
1195     {
1196         unsigned int i;
1197
1198         ssc_intersection(ssc,
1199                          anded_cp_list,
1200                          FALSE /* Has already been inverted */
1201                          );
1202
1203         /* If either P1 or P2 is empty, the intersection will be also; can skip
1204          * the loop */
1205         if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) {
1206             ANYOF_POSIXL_ZERO(ssc);
1207         }
1208         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1209
1210             /* Note that the Posix class component P from 'and_with' actually
1211              * looks like:
1212              *      P = Pa | Pb | ... | Pn
1213              * where each component is one posix class, such as in [\w\s].
1214              * Thus
1215              *      ~P = ~(Pa | Pb | ... | Pn)
1216              *         = ~Pa & ~Pb & ... & ~Pn
1217              *        <= ~Pa | ~Pb | ... | ~Pn
1218              * The last is something we can easily calculate, but unfortunately
1219              * is likely to have many false positives.  We could do better
1220              * in some (but certainly not all) instances if two classes in
1221              * P have known relationships.  For example
1222              *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1223              * So
1224              *      :lower: & :print: = :lower:
1225              * And similarly for classes that must be disjoint.  For example,
1226              * since \s and \w can have no elements in common based on rules in
1227              * the POSIX standard,
1228              *      \w & ^\S = nothing
1229              * Unfortunately, some vendor locales do not meet the Posix
1230              * standard, in particular almost everything by Microsoft.
1231              * The loop below just changes e.g., \w into \W and vice versa */
1232
1233             regnode_charclass_posixl temp;
1234             int add = 1;    /* To calculate the index of the complement */
1235
1236             ANYOF_POSIXL_ZERO(&temp);
1237             for (i = 0; i < ANYOF_MAX; i++) {
1238                 assert(i % 2 != 0
1239                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1240                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1241
1242                 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1243                     ANYOF_POSIXL_SET(&temp, i + add);
1244                 }
1245                 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1246             }
1247             ANYOF_POSIXL_AND(&temp, ssc);
1248
1249         } /* else ssc already has no posixes */
1250     } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
1251          in its initial state */
1252     else if (! is_ANYOF_SYNTHETIC(and_with)
1253              || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1254     {
1255         /* But if 'ssc' is in its initial state, the result is just 'and_with';
1256          * copy it over 'ssc' */
1257         if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1258             if (is_ANYOF_SYNTHETIC(and_with)) {
1259                 StructCopy(and_with, ssc, regnode_ssc);
1260             }
1261             else {
1262                 ssc->invlist = anded_cp_list;
1263                 ANYOF_POSIXL_ZERO(ssc);
1264                 if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1265                     ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1266                 }
1267             }
1268         }
1269         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1270                  || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL))
1271         {
1272             /* One or the other of P1, P2 is non-empty. */
1273             if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1274                 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1275             }
1276             ssc_union(ssc, anded_cp_list, FALSE);
1277         }
1278         else { /* P1 = P2 = empty */
1279             ssc_intersection(ssc, anded_cp_list, FALSE);
1280         }
1281     }
1282 }
1283
1284 STATIC void
1285 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1286                const regnode_charclass *or_with)
1287 {
1288     /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1289      * another SSC or a regular ANYOF class.  Can create false positives if
1290      * 'or_with' is to be inverted. */
1291
1292     SV* ored_cp_list;
1293     U8 ored_flags;
1294
1295     PERL_ARGS_ASSERT_SSC_OR;
1296
1297     assert(is_ANYOF_SYNTHETIC(ssc));
1298
1299     /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1300      * the code point inversion list and just the relevant flags */
1301     if (is_ANYOF_SYNTHETIC(or_with)) {
1302         ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1303         ored_flags = ANYOF_FLAGS(or_with);
1304     }
1305     else {
1306         ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1307         ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1308     }
1309
1310     ANYOF_FLAGS(ssc) |= ored_flags;
1311
1312     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1313      * C2 is the list of code points in 'or-with'; P2, its posix classes.
1314      * 'or_with' may be inverted.  When not inverted, we have the simple
1315      * situation of computing:
1316      *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
1317      * If P1|P2 yields a situation with both a class and its complement are
1318      * set, like having both \w and \W, this matches all code points, and we
1319      * can delete these from the P component of the ssc going forward.  XXX We
1320      * might be able to delete all the P components, but I (khw) am not certain
1321      * about this, and it is better to be safe.
1322      *
1323      * Inverted, we have
1324      *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
1325      *                         <=  (C1 | P1) | ~C2
1326      *                         <=  (C1 | ~C2) | P1
1327      * (which results in actually simpler code than the non-inverted case)
1328      * */
1329
1330     if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1331         && ! is_ANYOF_SYNTHETIC(or_with))
1332     {
1333         /* We ignore P2, leaving P1 going forward */
1334     }   /* else  Not inverted */
1335     else if (ANYOF_FLAGS(or_with) & ANYOF_POSIXL) {
1336         ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1337         if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1338             unsigned int i;
1339             for (i = 0; i < ANYOF_MAX; i += 2) {
1340                 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1341                 {
1342                     ssc_match_all_cp(ssc);
1343                     ANYOF_POSIXL_CLEAR(ssc, i);
1344                     ANYOF_POSIXL_CLEAR(ssc, i+1);
1345                 }
1346             }
1347         }
1348     }
1349
1350     ssc_union(ssc,
1351               ored_cp_list,
1352               FALSE /* Already has been inverted */
1353               );
1354 }
1355
1356 PERL_STATIC_INLINE void
1357 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1358 {
1359     PERL_ARGS_ASSERT_SSC_UNION;
1360
1361     assert(is_ANYOF_SYNTHETIC(ssc));
1362
1363     _invlist_union_maybe_complement_2nd(ssc->invlist,
1364                                         invlist,
1365                                         invert2nd,
1366                                         &ssc->invlist);
1367 }
1368
1369 PERL_STATIC_INLINE void
1370 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1371                          SV* const invlist,
1372                          const bool invert2nd)
1373 {
1374     PERL_ARGS_ASSERT_SSC_INTERSECTION;
1375
1376     assert(is_ANYOF_SYNTHETIC(ssc));
1377
1378     _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1379                                                invlist,
1380                                                invert2nd,
1381                                                &ssc->invlist);
1382 }
1383
1384 PERL_STATIC_INLINE void
1385 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1386 {
1387     PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1388
1389     assert(is_ANYOF_SYNTHETIC(ssc));
1390
1391     ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1392 }
1393
1394 PERL_STATIC_INLINE void
1395 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1396 {
1397     /* AND just the single code point 'cp' into the SSC 'ssc' */
1398
1399     SV* cp_list = _new_invlist(2);
1400
1401     PERL_ARGS_ASSERT_SSC_CP_AND;
1402
1403     assert(is_ANYOF_SYNTHETIC(ssc));
1404
1405     cp_list = add_cp_to_invlist(cp_list, cp);
1406     ssc_intersection(ssc, cp_list,
1407                      FALSE /* Not inverted */
1408                      );
1409     SvREFCNT_dec_NN(cp_list);
1410 }
1411
1412 PERL_STATIC_INLINE void
1413 S_ssc_clear_locale(pTHX_ regnode_ssc *ssc)
1414 {
1415     /* Set the SSC 'ssc' to not match any locale things */
1416
1417     PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1418
1419     assert(is_ANYOF_SYNTHETIC(ssc));
1420
1421     ANYOF_POSIXL_ZERO(ssc);
1422     ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1423 }
1424
1425 STATIC void
1426 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1427 {
1428     /* The inversion list in the SSC is marked mortal; now we need a more
1429      * permanent copy, which is stored the same way that is done in a regular
1430      * ANYOF node, with the first 256 code points in a bit map */
1431
1432     SV* invlist = invlist_clone(ssc->invlist);
1433
1434     PERL_ARGS_ASSERT_SSC_FINALIZE;
1435
1436     assert(is_ANYOF_SYNTHETIC(ssc));
1437
1438     /* The code in this file assumes that all but these flags aren't relevant
1439      * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the
1440      * time we reach here */
1441     assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
1442
1443     populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1444
1445     set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1446                                 NULL, NULL, NULL, FALSE);
1447
1448     /* Make sure is clone-safe */
1449     ssc->invlist = NULL;
1450
1451     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1452         ANYOF_FLAGS(ssc) |= ANYOF_POSIXL;
1453     }
1454
1455     assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1456 }
1457
1458 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1459 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1460 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1461 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list         \
1462                                ? (TRIE_LIST_CUR( idx ) - 1)           \
1463                                : 0 )
1464
1465
1466 #ifdef DEBUGGING
1467 /*
1468    dump_trie(trie,widecharmap,revcharmap)
1469    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1470    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1471
1472    These routines dump out a trie in a somewhat readable format.
1473    The _interim_ variants are used for debugging the interim
1474    tables that are used to generate the final compressed
1475    representation which is what dump_trie expects.
1476
1477    Part of the reason for their existence is to provide a form
1478    of documentation as to how the different representations function.
1479
1480 */
1481
1482 /*
1483   Dumps the final compressed table form of the trie to Perl_debug_log.
1484   Used for debugging make_trie().
1485 */
1486
1487 STATIC void
1488 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1489             AV *revcharmap, U32 depth)
1490 {
1491     U32 state;
1492     SV *sv=sv_newmortal();
1493     int colwidth= widecharmap ? 6 : 4;
1494     U16 word;
1495     GET_RE_DEBUG_FLAGS_DECL;
1496
1497     PERL_ARGS_ASSERT_DUMP_TRIE;
1498
1499     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1500         (int)depth * 2 + 2,"",
1501         "Match","Base","Ofs" );
1502
1503     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1504         SV ** const tmp = av_fetch( revcharmap, state, 0);
1505         if ( tmp ) {
1506             PerlIO_printf( Perl_debug_log, "%*s",
1507                 colwidth,
1508                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1509                             PL_colors[0], PL_colors[1],
1510                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1511                             PERL_PV_ESCAPE_FIRSTCHAR
1512                 )
1513             );
1514         }
1515     }
1516     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1517         (int)depth * 2 + 2,"");
1518
1519     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1520         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1521     PerlIO_printf( Perl_debug_log, "\n");
1522
1523     for( state = 1 ; state < trie->statecount ; state++ ) {
1524         const U32 base = trie->states[ state ].trans.base;
1525
1526         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1527                                        (int)depth * 2 + 2,"", (UV)state);
1528
1529         if ( trie->states[ state ].wordnum ) {
1530             PerlIO_printf( Perl_debug_log, " W%4X",
1531                                            trie->states[ state ].wordnum );
1532         } else {
1533             PerlIO_printf( Perl_debug_log, "%6s", "" );
1534         }
1535
1536         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1537
1538         if ( base ) {
1539             U32 ofs = 0;
1540
1541             while( ( base + ofs  < trie->uniquecharcount ) ||
1542                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1543                      && trie->trans[ base + ofs - trie->uniquecharcount ].check
1544                                                                     != state))
1545                     ofs++;
1546
1547             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1548
1549             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1550                 if ( ( base + ofs >= trie->uniquecharcount )
1551                         && ( base + ofs - trie->uniquecharcount
1552                                                         < trie->lasttrans )
1553                         && trie->trans[ base + ofs
1554                                     - trie->uniquecharcount ].check == state )
1555                 {
1556                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1557                     colwidth,
1558                     (UV)trie->trans[ base + ofs
1559                                              - trie->uniquecharcount ].next );
1560                 } else {
1561                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1562                 }
1563             }
1564
1565             PerlIO_printf( Perl_debug_log, "]");
1566
1567         }
1568         PerlIO_printf( Perl_debug_log, "\n" );
1569     }
1570     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1571                                 (int)depth*2, "");
1572     for (word=1; word <= trie->wordcount; word++) {
1573         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1574             (int)word, (int)(trie->wordinfo[word].prev),
1575             (int)(trie->wordinfo[word].len));
1576     }
1577     PerlIO_printf(Perl_debug_log, "\n" );
1578 }
1579 /*
1580   Dumps a fully constructed but uncompressed trie in list form.
1581   List tries normally only are used for construction when the number of
1582   possible chars (trie->uniquecharcount) is very high.
1583   Used for debugging make_trie().
1584 */
1585 STATIC void
1586 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1587                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1588                          U32 depth)
1589 {
1590     U32 state;
1591     SV *sv=sv_newmortal();
1592     int colwidth= widecharmap ? 6 : 4;
1593     GET_RE_DEBUG_FLAGS_DECL;
1594
1595     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1596
1597     /* print out the table precompression.  */
1598     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1599         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1600         "------:-----+-----------------\n" );
1601
1602     for( state=1 ; state < next_alloc ; state ++ ) {
1603         U16 charid;
1604
1605         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1606             (int)depth * 2 + 2,"", (UV)state  );
1607         if ( ! trie->states[ state ].wordnum ) {
1608             PerlIO_printf( Perl_debug_log, "%5s| ","");
1609         } else {
1610             PerlIO_printf( Perl_debug_log, "W%4x| ",
1611                 trie->states[ state ].wordnum
1612             );
1613         }
1614         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1615             SV ** const tmp = av_fetch( revcharmap,
1616                                         TRIE_LIST_ITEM(state,charid).forid, 0);
1617             if ( tmp ) {
1618                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1619                     colwidth,
1620                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1621                               colwidth,
1622                               PL_colors[0], PL_colors[1],
1623                               (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1624                               | PERL_PV_ESCAPE_FIRSTCHAR
1625                     ) ,
1626                     TRIE_LIST_ITEM(state,charid).forid,
1627                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1628                 );
1629                 if (!(charid % 10))
1630                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1631                         (int)((depth * 2) + 14), "");
1632             }
1633         }
1634         PerlIO_printf( Perl_debug_log, "\n");
1635     }
1636 }
1637
1638 /*
1639   Dumps a fully constructed but uncompressed trie in table form.
1640   This is the normal DFA style state transition table, with a few
1641   twists to facilitate compression later.
1642   Used for debugging make_trie().
1643 */
1644 STATIC void
1645 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1646                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1647                           U32 depth)
1648 {
1649     U32 state;
1650     U16 charid;
1651     SV *sv=sv_newmortal();
1652     int colwidth= widecharmap ? 6 : 4;
1653     GET_RE_DEBUG_FLAGS_DECL;
1654
1655     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1656
1657     /*
1658        print out the table precompression so that we can do a visual check
1659        that they are identical.
1660      */
1661
1662     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1663
1664     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1665         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1666         if ( tmp ) {
1667             PerlIO_printf( Perl_debug_log, "%*s",
1668                 colwidth,
1669                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1670                             PL_colors[0], PL_colors[1],
1671                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1672                             PERL_PV_ESCAPE_FIRSTCHAR
1673                 )
1674             );
1675         }
1676     }
1677
1678     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1679
1680     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1681         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1682     }
1683
1684     PerlIO_printf( Perl_debug_log, "\n" );
1685
1686     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1687
1688         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1689             (int)depth * 2 + 2,"",
1690             (UV)TRIE_NODENUM( state ) );
1691
1692         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1693             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1694             if (v)
1695                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1696             else
1697                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1698         }
1699         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1700             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1701                                             (UV)trie->trans[ state ].check );
1702         } else {
1703             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1704                                             (UV)trie->trans[ state ].check,
1705             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1706         }
1707     }
1708 }
1709
1710 #endif
1711
1712
1713 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1714   startbranch: the first branch in the whole branch sequence
1715   first      : start branch of sequence of branch-exact nodes.
1716                May be the same as startbranch
1717   last       : Thing following the last branch.
1718                May be the same as tail.
1719   tail       : item following the branch sequence
1720   count      : words in the sequence
1721   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1722   depth      : indent depth
1723
1724 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1725
1726 A trie is an N'ary tree where the branches are determined by digital
1727 decomposition of the key. IE, at the root node you look up the 1st character and
1728 follow that branch repeat until you find the end of the branches. Nodes can be
1729 marked as "accepting" meaning they represent a complete word. Eg:
1730
1731   /he|she|his|hers/
1732
1733 would convert into the following structure. Numbers represent states, letters
1734 following numbers represent valid transitions on the letter from that state, if
1735 the number is in square brackets it represents an accepting state, otherwise it
1736 will be in parenthesis.
1737
1738       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1739       |    |
1740       |   (2)
1741       |    |
1742      (1)   +-i->(6)-+-s->[7]
1743       |
1744       +-s->(3)-+-h->(4)-+-e->[5]
1745
1746       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1747
1748 This shows that when matching against the string 'hers' we will begin at state 1
1749 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1750 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1751 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1752 single traverse. We store a mapping from accepting to state to which word was
1753 matched, and then when we have multiple possibilities we try to complete the
1754 rest of the regex in the order in which they occured in the alternation.
1755
1756 The only prior NFA like behaviour that would be changed by the TRIE support is
1757 the silent ignoring of duplicate alternations which are of the form:
1758
1759  / (DUPE|DUPE) X? (?{ ... }) Y /x
1760
1761 Thus EVAL blocks following a trie may be called a different number of times with
1762 and without the optimisation. With the optimisations dupes will be silently
1763 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1764 the following demonstrates:
1765
1766  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1767
1768 which prints out 'word' three times, but
1769
1770  'words'=~/(word|word|word)(?{ print $1 })S/
1771
1772 which doesnt print it out at all. This is due to other optimisations kicking in.
1773
1774 Example of what happens on a structural level:
1775
1776 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1777
1778    1: CURLYM[1] {1,32767}(18)
1779    5:   BRANCH(8)
1780    6:     EXACT <ac>(16)
1781    8:   BRANCH(11)
1782    9:     EXACT <ad>(16)
1783   11:   BRANCH(14)
1784   12:     EXACT <ab>(16)
1785   16:   SUCCEED(0)
1786   17:   NOTHING(18)
1787   18: END(0)
1788
1789 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1790 and should turn into:
1791
1792    1: CURLYM[1] {1,32767}(18)
1793    5:   TRIE(16)
1794         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1795           <ac>
1796           <ad>
1797           <ab>
1798   16:   SUCCEED(0)
1799   17:   NOTHING(18)
1800   18: END(0)
1801
1802 Cases where tail != last would be like /(?foo|bar)baz/:
1803
1804    1: BRANCH(4)
1805    2:   EXACT <foo>(8)
1806    4: BRANCH(7)
1807    5:   EXACT <bar>(8)
1808    7: TAIL(8)
1809    8: EXACT <baz>(10)
1810   10: END(0)
1811
1812 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1813 and would end up looking like:
1814
1815     1: TRIE(8)
1816       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1817         <foo>
1818         <bar>
1819    7: TAIL(8)
1820    8: EXACT <baz>(10)
1821   10: END(0)
1822
1823     d = uvchr_to_utf8_flags(d, uv, 0);
1824
1825 is the recommended Unicode-aware way of saying
1826
1827     *(d++) = uv;
1828 */
1829
1830 #define TRIE_STORE_REVCHAR(val)                                            \
1831     STMT_START {                                                           \
1832         if (UTF) {                                                         \
1833             SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1834             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1835             unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1836             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1837             SvPOK_on(zlopp);                                               \
1838             SvUTF8_on(zlopp);                                              \
1839             av_push(revcharmap, zlopp);                                    \
1840         } else {                                                           \
1841             char ooooff = (char)val;                                           \
1842             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1843         }                                                                  \
1844         } STMT_END
1845
1846 /* This gets the next character from the input, folding it if not already
1847  * folded. */
1848 #define TRIE_READ_CHAR STMT_START {                                           \
1849     wordlen++;                                                                \
1850     if ( UTF ) {                                                              \
1851         /* if it is UTF then it is either already folded, or does not need    \
1852          * folding */                                                         \
1853         uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
1854     }                                                                         \
1855     else if (folder == PL_fold_latin1) {                                      \
1856         /* This folder implies Unicode rules, which in the range expressible  \
1857          *  by not UTF is the lower case, with the two exceptions, one of     \
1858          *  which should have been taken care of before calling this */       \
1859         assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
1860         uvc = toLOWER_L1(*uc);                                                \
1861         if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
1862         len = 1;                                                              \
1863     } else {                                                                  \
1864         /* raw data, will be folded later if needed */                        \
1865         uvc = (U32)*uc;                                                       \
1866         len = 1;                                                              \
1867     }                                                                         \
1868 } STMT_END
1869
1870
1871
1872 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1873     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1874         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1875         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1876     }                                                           \
1877     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1878     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1879     TRIE_LIST_CUR( state )++;                                   \
1880 } STMT_END
1881
1882 #define TRIE_LIST_NEW(state) STMT_START {                       \
1883     Newxz( trie->states[ state ].trans.list,               \
1884         4, reg_trie_trans_le );                                 \
1885      TRIE_LIST_CUR( state ) = 1;                                \
1886      TRIE_LIST_LEN( state ) = 4;                                \
1887 } STMT_END
1888
1889 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1890     U16 dupe= trie->states[ state ].wordnum;                    \
1891     regnode * const noper_next = regnext( noper );              \
1892                                                                 \
1893     DEBUG_r({                                                   \
1894         /* store the word for dumping */                        \
1895         SV* tmp;                                                \
1896         if (OP(noper) != NOTHING)                               \
1897             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1898         else                                                    \
1899             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1900         av_push( trie_words, tmp );                             \
1901     });                                                         \
1902                                                                 \
1903     curword++;                                                  \
1904     trie->wordinfo[curword].prev   = 0;                         \
1905     trie->wordinfo[curword].len    = wordlen;                   \
1906     trie->wordinfo[curword].accept = state;                     \
1907                                                                 \
1908     if ( noper_next < tail ) {                                  \
1909         if (!trie->jump)                                        \
1910             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
1911                                                  sizeof(U16) ); \
1912         trie->jump[curword] = (U16)(noper_next - convert);      \
1913         if (!jumper)                                            \
1914             jumper = noper_next;                                \
1915         if (!nextbranch)                                        \
1916             nextbranch= regnext(cur);                           \
1917     }                                                           \
1918                                                                 \
1919     if ( dupe ) {                                               \
1920         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1921         /* chain, so that when the bits of chain are later    */\
1922         /* linked together, the dups appear in the chain      */\
1923         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1924         trie->wordinfo[dupe].prev = curword;                    \
1925     } else {                                                    \
1926         /* we haven't inserted this word yet.                */ \
1927         trie->states[ state ].wordnum = curword;                \
1928     }                                                           \
1929 } STMT_END
1930
1931
1932 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1933      ( ( base + charid >=  ucharcount                                   \
1934          && base + charid < ubound                                      \
1935          && state == trie->trans[ base - ucharcount + charid ].check    \
1936          && trie->trans[ base - ucharcount + charid ].next )            \
1937            ? trie->trans[ base - ucharcount + charid ].next             \
1938            : ( state==1 ? special : 0 )                                 \
1939       )
1940
1941 #define MADE_TRIE       1
1942 #define MADE_JUMP_TRIE  2
1943 #define MADE_EXACT_TRIE 4
1944
1945 STATIC I32
1946 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
1947                   regnode *first, regnode *last, regnode *tail,
1948                   U32 word_count, U32 flags, U32 depth)
1949 {
1950     dVAR;
1951     /* first pass, loop through and scan words */
1952     reg_trie_data *trie;
1953     HV *widecharmap = NULL;
1954     AV *revcharmap = newAV();
1955     regnode *cur;
1956     STRLEN len = 0;
1957     UV uvc = 0;
1958     U16 curword = 0;
1959     U32 next_alloc = 0;
1960     regnode *jumper = NULL;
1961     regnode *nextbranch = NULL;
1962     regnode *convert = NULL;
1963     U32 *prev_states; /* temp array mapping each state to previous one */
1964     /* we just use folder as a flag in utf8 */
1965     const U8 * folder = NULL;
1966
1967 #ifdef DEBUGGING
1968     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
1969     AV *trie_words = NULL;
1970     /* along with revcharmap, this only used during construction but both are
1971      * useful during debugging so we store them in the struct when debugging.
1972      */
1973 #else
1974     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
1975     STRLEN trie_charcount=0;
1976 #endif
1977     SV *re_trie_maxbuff;
1978     GET_RE_DEBUG_FLAGS_DECL;
1979
1980     PERL_ARGS_ASSERT_MAKE_TRIE;
1981 #ifndef DEBUGGING
1982     PERL_UNUSED_ARG(depth);
1983 #endif
1984
1985     switch (flags) {
1986         case EXACT: break;
1987         case EXACTFA:
1988         case EXACTFU_SS:
1989         case EXACTFU: folder = PL_fold_latin1; break;
1990         case EXACTF:  folder = PL_fold; break;
1991         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1992     }
1993
1994     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1995     trie->refcount = 1;
1996     trie->startstate = 1;
1997     trie->wordcount = word_count;
1998     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1999     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2000     if (flags == EXACT)
2001         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2002     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2003                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
2004
2005     DEBUG_r({
2006         trie_words = newAV();
2007     });
2008
2009     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2010     assert(re_trie_maxbuff);
2011     if (!SvIOK(re_trie_maxbuff)) {
2012         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2013     }
2014     DEBUG_TRIE_COMPILE_r({
2015         PerlIO_printf( Perl_debug_log,
2016           "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2017           (int)depth * 2 + 2, "",
2018           REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2019           REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2020     });
2021
2022    /* Find the node we are going to overwrite */
2023     if ( first == startbranch && OP( last ) != BRANCH ) {
2024         /* whole branch chain */
2025         convert = first;
2026     } else {
2027         /* branch sub-chain */
2028         convert = NEXTOPER( first );
2029     }
2030
2031     /*  -- First loop and Setup --
2032
2033        We first traverse the branches and scan each word to determine if it
2034        contains widechars, and how many unique chars there are, this is
2035        important as we have to build a table with at least as many columns as we
2036        have unique chars.
2037
2038        We use an array of integers to represent the character codes 0..255
2039        (trie->charmap) and we use a an HV* to store Unicode characters. We use
2040        the native representation of the character value as the key and IV's for
2041        the coded index.
2042
2043        *TODO* If we keep track of how many times each character is used we can
2044        remap the columns so that the table compression later on is more
2045        efficient in terms of memory by ensuring the most common value is in the
2046        middle and the least common are on the outside.  IMO this would be better
2047        than a most to least common mapping as theres a decent chance the most
2048        common letter will share a node with the least common, meaning the node
2049        will not be compressible. With a middle is most common approach the worst
2050        case is when we have the least common nodes twice.
2051
2052      */
2053
2054     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2055         regnode *noper = NEXTOPER( cur );
2056         const U8 *uc = (U8*)STRING( noper );
2057         const U8 *e  = uc + STR_LEN( noper );
2058         int foldlen = 0;
2059         U32 wordlen      = 0;         /* required init */
2060         STRLEN minchars = 0;
2061         STRLEN maxchars = 0;
2062         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2063                                                bitmap?*/
2064
2065         if (OP(noper) == NOTHING) {
2066             regnode *noper_next= regnext(noper);
2067             if (noper_next != tail && OP(noper_next) == flags) {
2068                 noper = noper_next;
2069                 uc= (U8*)STRING(noper);
2070                 e= uc + STR_LEN(noper);
2071                 trie->minlen= STR_LEN(noper);
2072             } else {
2073                 trie->minlen= 0;
2074                 continue;
2075             }
2076         }
2077
2078         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2079             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2080                                           regardless of encoding */
2081             if (OP( noper ) == EXACTFU_SS) {
2082                 /* false positives are ok, so just set this */
2083                 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2084             }
2085         }
2086         for ( ; uc < e ; uc += len ) {  /* Look at each char in the current
2087                                            branch */
2088             TRIE_CHARCOUNT(trie)++;
2089             TRIE_READ_CHAR;
2090
2091             /* TRIE_READ_CHAR returns the current character, or its fold if /i
2092              * is in effect.  Under /i, this character can match itself, or
2093              * anything that folds to it.  If not under /i, it can match just
2094              * itself.  Most folds are 1-1, for example k, K, and KELVIN SIGN
2095              * all fold to k, and all are single characters.   But some folds
2096              * expand to more than one character, so for example LATIN SMALL
2097              * LIGATURE FFI folds to the three character sequence 'ffi'.  If
2098              * the string beginning at 'uc' is 'ffi', it could be matched by
2099              * three characters, or just by the one ligature character. (It
2100              * could also be matched by two characters: LATIN SMALL LIGATURE FF
2101              * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2102              * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2103              * match.)  The trie needs to know the minimum and maximum number
2104              * of characters that could match so that it can use size alone to
2105              * quickly reject many match attempts.  The max is simple: it is
2106              * the number of folded characters in this branch (since a fold is
2107              * never shorter than what folds to it. */
2108
2109             maxchars++;
2110
2111             /* And the min is equal to the max if not under /i (indicated by
2112              * 'folder' being NULL), or there are no multi-character folds.  If
2113              * there is a multi-character fold, the min is incremented just
2114              * once, for the character that folds to the sequence.  Each
2115              * character in the sequence needs to be added to the list below of
2116              * characters in the trie, but we count only the first towards the
2117              * min number of characters needed.  This is done through the
2118              * variable 'foldlen', which is returned by the macros that look
2119              * for these sequences as the number of bytes the sequence
2120              * occupies.  Each time through the loop, we decrement 'foldlen' by
2121              * how many bytes the current char occupies.  Only when it reaches
2122              * 0 do we increment 'minchars' or look for another multi-character
2123              * sequence. */
2124             if (folder == NULL) {
2125                 minchars++;
2126             }
2127             else if (foldlen > 0) {
2128                 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2129             }
2130             else {
2131                 minchars++;
2132
2133                 /* See if *uc is the beginning of a multi-character fold.  If
2134                  * so, we decrement the length remaining to look at, to account
2135                  * for the current character this iteration.  (We can use 'uc'
2136                  * instead of the fold returned by TRIE_READ_CHAR because for
2137                  * non-UTF, the latin1_safe macro is smart enough to account
2138                  * for all the unfolded characters, and because for UTF, the
2139                  * string will already have been folded earlier in the
2140                  * compilation process */
2141                 if (UTF) {
2142                     if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2143                         foldlen -= UTF8SKIP(uc);
2144                     }
2145                 }
2146                 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2147                     foldlen--;
2148                 }
2149             }
2150
2151             /* The current character (and any potential folds) should be added
2152              * to the possible matching characters for this position in this
2153              * branch */
2154             if ( uvc < 256 ) {
2155                 if ( folder ) {
2156                     U8 folded= folder[ (U8) uvc ];
2157                     if ( !trie->charmap[ folded ] ) {
2158                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
2159                         TRIE_STORE_REVCHAR( folded );
2160                     }
2161                 }
2162                 if ( !trie->charmap[ uvc ] ) {
2163                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2164                     TRIE_STORE_REVCHAR( uvc );
2165                 }
2166                 if ( set_bit ) {
2167                     /* store the codepoint in the bitmap, and its folded
2168                      * equivalent. */
2169                     TRIE_BITMAP_SET(trie, uvc);
2170
2171                     /* store the folded codepoint */
2172                     if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2173
2174                     if ( !UTF ) {
2175                         /* store first byte of utf8 representation of
2176                            variant codepoints */
2177                         if (! UVCHR_IS_INVARIANT(uvc)) {
2178                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2179                         }
2180                     }
2181                     set_bit = 0; /* We've done our bit :-) */
2182                 }
2183             } else {
2184
2185                 /* XXX We could come up with the list of code points that fold
2186                  * to this using PL_utf8_foldclosures, except not for
2187                  * multi-char folds, as there may be multiple combinations
2188                  * there that could work, which needs to wait until runtime to
2189                  * resolve (The comment about LIGATURE FFI above is such an
2190                  * example */
2191
2192                 SV** svpp;
2193                 if ( !widecharmap )
2194                     widecharmap = newHV();
2195
2196                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2197
2198                 if ( !svpp )
2199                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2200
2201                 if ( !SvTRUE( *svpp ) ) {
2202                     sv_setiv( *svpp, ++trie->uniquecharcount );
2203                     TRIE_STORE_REVCHAR(uvc);
2204                 }
2205             }
2206         } /* end loop through characters in this branch of the trie */
2207
2208         /* We take the min and max for this branch and combine to find the min
2209          * and max for all branches processed so far */
2210         if( cur == first ) {
2211             trie->minlen = minchars;
2212             trie->maxlen = maxchars;
2213         } else if (minchars < trie->minlen) {
2214             trie->minlen = minchars;
2215         } else if (maxchars > trie->maxlen) {
2216             trie->maxlen = maxchars;
2217         }
2218     } /* end first pass */
2219     DEBUG_TRIE_COMPILE_r(
2220         PerlIO_printf( Perl_debug_log,
2221                 "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2222                 (int)depth * 2 + 2,"",
2223                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2224                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2225                 (int)trie->minlen, (int)trie->maxlen )
2226     );
2227
2228     /*
2229         We now know what we are dealing with in terms of unique chars and
2230         string sizes so we can calculate how much memory a naive
2231         representation using a flat table  will take. If it's over a reasonable
2232         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2233         conservative but potentially much slower representation using an array
2234         of lists.
2235
2236         At the end we convert both representations into the same compressed
2237         form that will be used in regexec.c for matching with. The latter
2238         is a form that cannot be used to construct with but has memory
2239         properties similar to the list form and access properties similar
2240         to the table form making it both suitable for fast searches and
2241         small enough that its feasable to store for the duration of a program.
2242
2243         See the comment in the code where the compressed table is produced
2244         inplace from the flat tabe representation for an explanation of how
2245         the compression works.
2246
2247     */
2248
2249
2250     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2251     prev_states[1] = 0;
2252
2253     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2254                                                     > SvIV(re_trie_maxbuff) )
2255     {
2256         /*
2257             Second Pass -- Array Of Lists Representation
2258
2259             Each state will be represented by a list of charid:state records
2260             (reg_trie_trans_le) the first such element holds the CUR and LEN
2261             points of the allocated array. (See defines above).
2262
2263             We build the initial structure using the lists, and then convert
2264             it into the compressed table form which allows faster lookups
2265             (but cant be modified once converted).
2266         */
2267
2268         STRLEN transcount = 1;
2269
2270         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2271             "%*sCompiling trie using list compiler\n",
2272             (int)depth * 2 + 2, ""));
2273
2274         trie->states = (reg_trie_state *)
2275             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2276                                   sizeof(reg_trie_state) );
2277         TRIE_LIST_NEW(1);
2278         next_alloc = 2;
2279
2280         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2281
2282             regnode *noper   = NEXTOPER( cur );
2283             U8 *uc           = (U8*)STRING( noper );
2284             const U8 *e      = uc + STR_LEN( noper );
2285             U32 state        = 1;         /* required init */
2286             U16 charid       = 0;         /* sanity init */
2287             U32 wordlen      = 0;         /* required init */
2288
2289             if (OP(noper) == NOTHING) {
2290                 regnode *noper_next= regnext(noper);
2291                 if (noper_next != tail && OP(noper_next) == flags) {
2292                     noper = noper_next;
2293                     uc= (U8*)STRING(noper);
2294                     e= uc + STR_LEN(noper);
2295                 }
2296             }
2297
2298             if (OP(noper) != NOTHING) {
2299                 for ( ; uc < e ; uc += len ) {
2300
2301                     TRIE_READ_CHAR;
2302
2303                     if ( uvc < 256 ) {
2304                         charid = trie->charmap[ uvc ];
2305                     } else {
2306                         SV** const svpp = hv_fetch( widecharmap,
2307                                                     (char*)&uvc,
2308                                                     sizeof( UV ),
2309                                                     0);
2310                         if ( !svpp ) {
2311                             charid = 0;
2312                         } else {
2313                             charid=(U16)SvIV( *svpp );
2314                         }
2315                     }
2316                     /* charid is now 0 if we dont know the char read, or
2317                      * nonzero if we do */
2318                     if ( charid ) {
2319
2320                         U16 check;
2321                         U32 newstate = 0;
2322
2323                         charid--;
2324                         if ( !trie->states[ state ].trans.list ) {
2325                             TRIE_LIST_NEW( state );
2326                         }
2327                         for ( check = 1;
2328                               check <= TRIE_LIST_USED( state );
2329                               check++ )
2330                         {
2331                             if ( TRIE_LIST_ITEM( state, check ).forid
2332                                                                     == charid )
2333                             {
2334                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2335                                 break;
2336                             }
2337                         }
2338                         if ( ! newstate ) {
2339                             newstate = next_alloc++;
2340                             prev_states[newstate] = state;
2341                             TRIE_LIST_PUSH( state, charid, newstate );
2342                             transcount++;
2343                         }
2344                         state = newstate;
2345                     } else {
2346                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2347                     }
2348                 }
2349             }
2350             TRIE_HANDLE_WORD(state);
2351
2352         } /* end second pass */
2353
2354         /* next alloc is the NEXT state to be allocated */
2355         trie->statecount = next_alloc;
2356         trie->states = (reg_trie_state *)
2357             PerlMemShared_realloc( trie->states,
2358                                    next_alloc
2359                                    * sizeof(reg_trie_state) );
2360
2361         /* and now dump it out before we compress it */
2362         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2363                                                          revcharmap, next_alloc,
2364                                                          depth+1)
2365         );
2366
2367         trie->trans = (reg_trie_trans *)
2368             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2369         {
2370             U32 state;
2371             U32 tp = 0;
2372             U32 zp = 0;
2373
2374
2375             for( state=1 ; state < next_alloc ; state ++ ) {
2376                 U32 base=0;
2377
2378                 /*
2379                 DEBUG_TRIE_COMPILE_MORE_r(
2380                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2381                 );
2382                 */
2383
2384                 if (trie->states[state].trans.list) {
2385                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2386                     U16 maxid=minid;
2387                     U16 idx;
2388
2389                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2390                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2391                         if ( forid < minid ) {
2392                             minid=forid;
2393                         } else if ( forid > maxid ) {
2394                             maxid=forid;
2395                         }
2396                     }
2397                     if ( transcount < tp + maxid - minid + 1) {
2398                         transcount *= 2;
2399                         trie->trans = (reg_trie_trans *)
2400                             PerlMemShared_realloc( trie->trans,
2401                                                      transcount
2402                                                      * sizeof(reg_trie_trans) );
2403                         Zero( trie->trans + (transcount / 2),
2404                               transcount / 2,
2405                               reg_trie_trans );
2406                     }
2407                     base = trie->uniquecharcount + tp - minid;
2408                     if ( maxid == minid ) {
2409                         U32 set = 0;
2410                         for ( ; zp < tp ; zp++ ) {
2411                             if ( ! trie->trans[ zp ].next ) {
2412                                 base = trie->uniquecharcount + zp - minid;
2413                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2414                                                                    1).newstate;
2415                                 trie->trans[ zp ].check = state;
2416                                 set = 1;
2417                                 break;
2418                             }
2419                         }
2420                         if ( !set ) {
2421                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2422                                                                    1).newstate;
2423                             trie->trans[ tp ].check = state;
2424                             tp++;
2425                             zp = tp;
2426                         }
2427                     } else {
2428                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2429                             const U32 tid = base
2430                                            - trie->uniquecharcount
2431                                            + TRIE_LIST_ITEM( state, idx ).forid;
2432                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2433                                                                 idx ).newstate;
2434                             trie->trans[ tid ].check = state;
2435                         }
2436                         tp += ( maxid - minid + 1 );
2437                     }
2438                     Safefree(trie->states[ state ].trans.list);
2439                 }
2440                 /*
2441                 DEBUG_TRIE_COMPILE_MORE_r(
2442                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2443                 );
2444                 */
2445                 trie->states[ state ].trans.base=base;
2446             }
2447             trie->lasttrans = tp + 1;
2448         }
2449     } else {
2450         /*
2451            Second Pass -- Flat Table Representation.
2452
2453            we dont use the 0 slot of either trans[] or states[] so we add 1 to
2454            each.  We know that we will need Charcount+1 trans at most to store
2455            the data (one row per char at worst case) So we preallocate both
2456            structures assuming worst case.
2457
2458            We then construct the trie using only the .next slots of the entry
2459            structs.
2460
2461            We use the .check field of the first entry of the node temporarily
2462            to make compression both faster and easier by keeping track of how
2463            many non zero fields are in the node.
2464
2465            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2466            transition.
2467
2468            There are two terms at use here: state as a TRIE_NODEIDX() which is
2469            a number representing the first entry of the node, and state as a
2470            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2471            and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2472            if there are 2 entrys per node. eg:
2473
2474              A B       A B
2475           1. 2 4    1. 3 7
2476           2. 0 3    3. 0 5
2477           3. 0 0    5. 0 0
2478           4. 0 0    7. 0 0
2479
2480            The table is internally in the right hand, idx form. However as we
2481            also have to deal with the states array which is indexed by nodenum
2482            we have to use TRIE_NODENUM() to convert.
2483
2484         */
2485         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2486             "%*sCompiling trie using table compiler\n",
2487             (int)depth * 2 + 2, ""));
2488
2489         trie->trans = (reg_trie_trans *)
2490             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2491                                   * trie->uniquecharcount + 1,
2492                                   sizeof(reg_trie_trans) );
2493         trie->states = (reg_trie_state *)
2494             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2495                                   sizeof(reg_trie_state) );
2496         next_alloc = trie->uniquecharcount + 1;
2497
2498
2499         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2500
2501             regnode *noper   = NEXTOPER( cur );
2502             const U8 *uc     = (U8*)STRING( noper );
2503             const U8 *e      = uc + STR_LEN( noper );
2504
2505             U32 state        = 1;         /* required init */
2506
2507             U16 charid       = 0;         /* sanity init */
2508             U32 accept_state = 0;         /* sanity init */
2509
2510             U32 wordlen      = 0;         /* required init */
2511
2512             if (OP(noper) == NOTHING) {
2513                 regnode *noper_next= regnext(noper);
2514                 if (noper_next != tail && OP(noper_next) == flags) {
2515                     noper = noper_next;
2516                     uc= (U8*)STRING(noper);
2517                     e= uc + STR_LEN(noper);
2518                 }
2519             }
2520
2521             if ( OP(noper) != NOTHING ) {
2522                 for ( ; uc < e ; uc += len ) {
2523
2524                     TRIE_READ_CHAR;
2525
2526                     if ( uvc < 256 ) {
2527                         charid = trie->charmap[ uvc ];
2528                     } else {
2529                         SV* const * const svpp = hv_fetch( widecharmap,
2530                                                            (char*)&uvc,
2531                                                            sizeof( UV ),
2532                                                            0);
2533                         charid = svpp ? (U16)SvIV(*svpp) : 0;
2534                     }
2535                     if ( charid ) {
2536                         charid--;
2537                         if ( !trie->trans[ state + charid ].next ) {
2538                             trie->trans[ state + charid ].next = next_alloc;
2539                             trie->trans[ state ].check++;
2540                             prev_states[TRIE_NODENUM(next_alloc)]
2541                                     = TRIE_NODENUM(state);
2542                             next_alloc += trie->uniquecharcount;
2543                         }
2544                         state = trie->trans[ state + charid ].next;
2545                     } else {
2546                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2547                     }
2548                     /* charid is now 0 if we dont know the char read, or
2549                      * nonzero if we do */
2550                 }
2551             }
2552             accept_state = TRIE_NODENUM( state );
2553             TRIE_HANDLE_WORD(accept_state);
2554
2555         } /* end second pass */
2556
2557         /* and now dump it out before we compress it */
2558         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2559                                                           revcharmap,
2560                                                           next_alloc, depth+1));
2561
2562         {
2563         /*
2564            * Inplace compress the table.*
2565
2566            For sparse data sets the table constructed by the trie algorithm will
2567            be mostly 0/FAIL transitions or to put it another way mostly empty.
2568            (Note that leaf nodes will not contain any transitions.)
2569
2570            This algorithm compresses the tables by eliminating most such
2571            transitions, at the cost of a modest bit of extra work during lookup:
2572
2573            - Each states[] entry contains a .base field which indicates the
2574            index in the state[] array wheres its transition data is stored.
2575
2576            - If .base is 0 there are no valid transitions from that node.
2577
2578            - If .base is nonzero then charid is added to it to find an entry in
2579            the trans array.
2580
2581            -If trans[states[state].base+charid].check!=state then the
2582            transition is taken to be a 0/Fail transition. Thus if there are fail
2583            transitions at the front of the node then the .base offset will point
2584            somewhere inside the previous nodes data (or maybe even into a node
2585            even earlier), but the .check field determines if the transition is
2586            valid.
2587
2588            XXX - wrong maybe?
2589            The following process inplace converts the table to the compressed
2590            table: We first do not compress the root node 1,and mark all its
2591            .check pointers as 1 and set its .base pointer as 1 as well. This
2592            allows us to do a DFA construction from the compressed table later,
2593            and ensures that any .base pointers we calculate later are greater
2594            than 0.
2595
2596            - We set 'pos' to indicate the first entry of the second node.
2597
2598            - We then iterate over the columns of the node, finding the first and
2599            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2600            and set the .check pointers accordingly, and advance pos
2601            appropriately and repreat for the next node. Note that when we copy
2602            the next pointers we have to convert them from the original
2603            NODEIDX form to NODENUM form as the former is not valid post
2604            compression.
2605
2606            - If a node has no transitions used we mark its base as 0 and do not
2607            advance the pos pointer.
2608
2609            - If a node only has one transition we use a second pointer into the
2610            structure to fill in allocated fail transitions from other states.
2611            This pointer is independent of the main pointer and scans forward
2612            looking for null transitions that are allocated to a state. When it
2613            finds one it writes the single transition into the "hole".  If the
2614            pointer doesnt find one the single transition is appended as normal.
2615
2616            - Once compressed we can Renew/realloc the structures to release the
2617            excess space.
2618
2619            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2620            specifically Fig 3.47 and the associated pseudocode.
2621
2622            demq
2623         */
2624         const U32 laststate = TRIE_NODENUM( next_alloc );
2625         U32 state, charid;
2626         U32 pos = 0, zp=0;
2627         trie->statecount = laststate;
2628
2629         for ( state = 1 ; state < laststate ; state++ ) {
2630             U8 flag = 0;
2631             const U32 stateidx = TRIE_NODEIDX( state );
2632             const U32 o_used = trie->trans[ stateidx ].check;
2633             U32 used = trie->trans[ stateidx ].check;
2634             trie->trans[ stateidx ].check = 0;
2635
2636             for ( charid = 0;
2637                   used && charid < trie->uniquecharcount;
2638                   charid++ )
2639             {
2640                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2641                     if ( trie->trans[ stateidx + charid ].next ) {
2642                         if (o_used == 1) {
2643                             for ( ; zp < pos ; zp++ ) {
2644                                 if ( ! trie->trans[ zp ].next ) {
2645                                     break;
2646                                 }
2647                             }
2648                             trie->states[ state ].trans.base
2649                                                     = zp
2650                                                       + trie->uniquecharcount
2651                                                       - charid ;
2652                             trie->trans[ zp ].next
2653                                 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2654                                                              + charid ].next );
2655                             trie->trans[ zp ].check = state;
2656                             if ( ++zp > pos ) pos = zp;
2657                             break;
2658                         }
2659                         used--;
2660                     }
2661                     if ( !flag ) {
2662                         flag = 1;
2663                         trie->states[ state ].trans.base
2664                                        = pos + trie->uniquecharcount - charid ;
2665                     }
2666                     trie->trans[ pos ].next
2667                         = SAFE_TRIE_NODENUM(
2668                                        trie->trans[ stateidx + charid ].next );
2669                     trie->trans[ pos ].check = state;
2670                     pos++;
2671                 }
2672             }
2673         }
2674         trie->lasttrans = pos + 1;
2675         trie->states = (reg_trie_state *)
2676             PerlMemShared_realloc( trie->states, laststate
2677                                    * sizeof(reg_trie_state) );
2678         DEBUG_TRIE_COMPILE_MORE_r(
2679             PerlIO_printf( Perl_debug_log,
2680                 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2681                 (int)depth * 2 + 2,"",
2682                 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2683                        + 1 ),
2684                 (IV)next_alloc,
2685                 (IV)pos,
2686                 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2687             );
2688
2689         } /* end table compress */
2690     }
2691     DEBUG_TRIE_COMPILE_MORE_r(
2692             PerlIO_printf(Perl_debug_log,
2693                 "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2694                 (int)depth * 2 + 2, "",
2695                 (UV)trie->statecount,
2696                 (UV)trie->lasttrans)
2697     );
2698     /* resize the trans array to remove unused space */
2699     trie->trans = (reg_trie_trans *)
2700         PerlMemShared_realloc( trie->trans, trie->lasttrans
2701                                * sizeof(reg_trie_trans) );
2702
2703     {   /* Modify the program and insert the new TRIE node */
2704         U8 nodetype =(U8)(flags & 0xFF);
2705         char *str=NULL;
2706
2707 #ifdef DEBUGGING
2708         regnode *optimize = NULL;
2709 #ifdef RE_TRACK_PATTERN_OFFSETS
2710
2711         U32 mjd_offset = 0;
2712         U32 mjd_nodelen = 0;
2713 #endif /* RE_TRACK_PATTERN_OFFSETS */
2714 #endif /* DEBUGGING */
2715         /*
2716            This means we convert either the first branch or the first Exact,
2717            depending on whether the thing following (in 'last') is a branch
2718            or not and whther first is the startbranch (ie is it a sub part of
2719            the alternation or is it the whole thing.)
2720            Assuming its a sub part we convert the EXACT otherwise we convert
2721            the whole branch sequence, including the first.
2722          */
2723         /* Find the node we are going to overwrite */
2724         if ( first != startbranch || OP( last ) == BRANCH ) {
2725             /* branch sub-chain */
2726             NEXT_OFF( first ) = (U16)(last - first);
2727 #ifdef RE_TRACK_PATTERN_OFFSETS
2728             DEBUG_r({
2729                 mjd_offset= Node_Offset((convert));
2730                 mjd_nodelen= Node_Length((convert));
2731             });
2732 #endif
2733             /* whole branch chain */
2734         }
2735 #ifdef RE_TRACK_PATTERN_OFFSETS
2736         else {
2737             DEBUG_r({
2738                 const  regnode *nop = NEXTOPER( convert );
2739                 mjd_offset= Node_Offset((nop));
2740                 mjd_nodelen= Node_Length((nop));
2741             });
2742         }
2743         DEBUG_OPTIMISE_r(
2744             PerlIO_printf(Perl_debug_log,
2745                 "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2746                 (int)depth * 2 + 2, "",
2747                 (UV)mjd_offset, (UV)mjd_nodelen)
2748         );
2749 #endif
2750         /* But first we check to see if there is a common prefix we can
2751            split out as an EXACT and put in front of the TRIE node.  */
2752         trie->startstate= 1;
2753         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2754             U32 state;
2755             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2756                 U32 ofs = 0;
2757                 I32 idx = -1;
2758                 U32 count = 0;
2759                 const U32 base = trie->states[ state ].trans.base;
2760
2761                 if ( trie->states[state].wordnum )
2762                         count = 1;
2763
2764                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2765                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2766                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2767                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2768                     {
2769                         if ( ++count > 1 ) {
2770                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2771                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2772                             if ( state == 1 ) break;
2773                             if ( count == 2 ) {
2774                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2775                                 DEBUG_OPTIMISE_r(
2776                                     PerlIO_printf(Perl_debug_log,
2777                                         "%*sNew Start State=%"UVuf" Class: [",
2778                                         (int)depth * 2 + 2, "",
2779                                         (UV)state));
2780                                 if (idx >= 0) {
2781                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2782                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2783
2784                                     TRIE_BITMAP_SET(trie,*ch);
2785                                     if ( folder )
2786                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2787                                     DEBUG_OPTIMISE_r(
2788                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2789                                     );
2790                                 }
2791                             }
2792                             TRIE_BITMAP_SET(trie,*ch);
2793                             if ( folder )
2794                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2795                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2796                         }
2797                         idx = ofs;
2798                     }
2799                 }
2800                 if ( count == 1 ) {
2801                     SV **tmp = av_fetch( revcharmap, idx, 0);
2802                     STRLEN len;
2803                     char *ch = SvPV( *tmp, len );
2804                     DEBUG_OPTIMISE_r({
2805                         SV *sv=sv_newmortal();
2806                         PerlIO_printf( Perl_debug_log,
2807                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2808                             (int)depth * 2 + 2, "",
2809                             (UV)state, (UV)idx,
2810                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2811                                 PL_colors[0], PL_colors[1],
2812                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2813                                 PERL_PV_ESCAPE_FIRSTCHAR
2814                             )
2815                         );
2816                     });
2817                     if ( state==1 ) {
2818                         OP( convert ) = nodetype;
2819                         str=STRING(convert);
2820                         STR_LEN(convert)=0;
2821                     }
2822                     STR_LEN(convert) += len;
2823                     while (len--)
2824                         *str++ = *ch++;
2825                 } else {
2826 #ifdef DEBUGGING
2827                     if (state>1)
2828                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2829 #endif
2830                     break;
2831                 }
2832             }
2833             trie->prefixlen = (state-1);
2834             if (str) {
2835                 regnode *n = convert+NODE_SZ_STR(convert);
2836                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2837                 trie->startstate = state;
2838                 trie->minlen -= (state - 1);
2839                 trie->maxlen -= (state - 1);
2840 #ifdef DEBUGGING
2841                /* At least the UNICOS C compiler choked on this
2842                 * being argument to DEBUG_r(), so let's just have
2843                 * it right here. */
2844                if (
2845 #ifdef PERL_EXT_RE_BUILD
2846                    1
2847 #else
2848                    DEBUG_r_TEST
2849 #endif
2850                    ) {
2851                    regnode *fix = convert;
2852                    U32 word = trie->wordcount;
2853                    mjd_nodelen++;
2854                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2855                    while( ++fix < n ) {
2856                        Set_Node_Offset_Length(fix, 0, 0);
2857                    }
2858                    while (word--) {
2859                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2860                        if (tmp) {
2861                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2862                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2863                            else
2864                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2865                        }
2866                    }
2867                }
2868 #endif
2869                 if (trie->maxlen) {
2870                     convert = n;
2871                 } else {
2872                     NEXT_OFF(convert) = (U16)(tail - convert);
2873                     DEBUG_r(optimize= n);
2874                 }
2875             }
2876         }
2877         if (!jumper)
2878             jumper = last;
2879         if ( trie->maxlen ) {
2880             NEXT_OFF( convert ) = (U16)(tail - convert);
2881             ARG_SET( convert, data_slot );
2882             /* Store the offset to the first unabsorbed branch in
2883                jump[0], which is otherwise unused by the jump logic.
2884                We use this when dumping a trie and during optimisation. */
2885             if (trie->jump)
2886                 trie->jump[0] = (U16)(nextbranch - convert);
2887
2888             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2889              *   and there is a bitmap
2890              *   and the first "jump target" node we found leaves enough room
2891              * then convert the TRIE node into a TRIEC node, with the bitmap
2892              * embedded inline in the opcode - this is hypothetically faster.
2893              */
2894             if ( !trie->states[trie->startstate].wordnum
2895                  && trie->bitmap
2896                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2897             {
2898                 OP( convert ) = TRIEC;
2899                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2900                 PerlMemShared_free(trie->bitmap);
2901                 trie->bitmap= NULL;
2902             } else
2903                 OP( convert ) = TRIE;
2904
2905             /* store the type in the flags */
2906             convert->flags = nodetype;
2907             DEBUG_r({
2908             optimize = convert
2909                       + NODE_STEP_REGNODE
2910                       + regarglen[ OP( convert ) ];
2911             });
2912             /* XXX We really should free up the resource in trie now,
2913                    as we won't use them - (which resources?) dmq */
2914         }
2915         /* needed for dumping*/
2916         DEBUG_r(if (optimize) {
2917             regnode *opt = convert;
2918
2919             while ( ++opt < optimize) {
2920                 Set_Node_Offset_Length(opt,0,0);
2921             }
2922             /*
2923                 Try to clean up some of the debris left after the
2924                 optimisation.
2925              */
2926             while( optimize < jumper ) {
2927                 mjd_nodelen += Node_Length((optimize));
2928                 OP( optimize ) = OPTIMIZED;
2929                 Set_Node_Offset_Length(optimize,0,0);
2930                 optimize++;
2931             }
2932             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2933         });
2934     } /* end node insert */
2935
2936     /*  Finish populating the prev field of the wordinfo array.  Walk back
2937      *  from each accept state until we find another accept state, and if
2938      *  so, point the first word's .prev field at the second word. If the
2939      *  second already has a .prev field set, stop now. This will be the
2940      *  case either if we've already processed that word's accept state,
2941      *  or that state had multiple words, and the overspill words were
2942      *  already linked up earlier.
2943      */
2944     {
2945         U16 word;
2946         U32 state;
2947         U16 prev;
2948
2949         for (word=1; word <= trie->wordcount; word++) {
2950             prev = 0;
2951             if (trie->wordinfo[word].prev)
2952                 continue;
2953             state = trie->wordinfo[word].accept;
2954             while (state) {
2955                 state = prev_states[state];
2956                 if (!state)
2957                     break;
2958                 prev = trie->states[state].wordnum;
2959                 if (prev)
2960                     break;
2961             }
2962             trie->wordinfo[word].prev = prev;
2963         }
2964         Safefree(prev_states);
2965     }
2966
2967
2968     /* and now dump out the compressed format */
2969     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2970
2971     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2972 #ifdef DEBUGGING
2973     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2974     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2975 #else
2976     SvREFCNT_dec_NN(revcharmap);
2977 #endif
2978     return trie->jump
2979            ? MADE_JUMP_TRIE
2980            : trie->startstate>1
2981              ? MADE_EXACT_TRIE
2982              : MADE_TRIE;
2983 }
2984
2985 STATIC void
2986 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2987 {
2988 /* The Trie is constructed and compressed now so we can build a fail array if
2989  * it's needed
2990
2991    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
2992    3.32 in the
2993    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
2994    Ullman 1985/88
2995    ISBN 0-201-10088-6
2996
2997    We find the fail state for each state in the trie, this state is the longest
2998    proper suffix of the current state's 'word' that is also a proper prefix of
2999    another word in our trie. State 1 represents the word '' and is thus the
3000    default fail state. This allows the DFA not to have to restart after its
3001    tried and failed a word at a given point, it simply continues as though it
3002    had been matching the other word in the first place.
3003    Consider
3004       'abcdgu'=~/abcdefg|cdgu/
3005    When we get to 'd' we are still matching the first word, we would encounter
3006    'g' which would fail, which would bring us to the state representing 'd' in
3007    the second word where we would try 'g' and succeed, proceeding to match
3008    'cdgu'.
3009  */
3010  /* add a fail transition */
3011     const U32 trie_offset = ARG(source);
3012     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3013     U32 *q;
3014     const U32 ucharcount = trie->uniquecharcount;
3015     const U32 numstates = trie->statecount;
3016     const U32 ubound = trie->lasttrans + ucharcount;
3017     U32 q_read = 0;
3018     U32 q_write = 0;
3019     U32 charid;
3020     U32 base = trie->states[ 1 ].trans.base;
3021     U32 *fail;
3022     reg_ac_data *aho;
3023     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3024     GET_RE_DEBUG_FLAGS_DECL;
3025
3026     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
3027 #ifndef DEBUGGING
3028     PERL_UNUSED_ARG(depth);
3029 #endif
3030
3031
3032     ARG_SET( stclass, data_slot );
3033     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3034     RExC_rxi->data->data[ data_slot ] = (void*)aho;
3035     aho->trie=trie_offset;
3036     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3037     Copy( trie->states, aho->states, numstates, reg_trie_state );
3038     Newxz( q, numstates, U32);
3039     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3040     aho->refcount = 1;
3041     fail = aho->fail;
3042     /* initialize fail[0..1] to be 1 so that we always have
3043        a valid final fail state */
3044     fail[ 0 ] = fail[ 1 ] = 1;
3045
3046     for ( charid = 0; charid < ucharcount ; charid++ ) {
3047         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3048         if ( newstate ) {
3049             q[ q_write ] = newstate;
3050             /* set to point at the root */
3051             fail[ q[ q_write++ ] ]=1;
3052         }
3053     }
3054     while ( q_read < q_write) {
3055         const U32 cur = q[ q_read++ % numstates ];
3056         base = trie->states[ cur ].trans.base;
3057
3058         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3059             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3060             if (ch_state) {
3061                 U32 fail_state = cur;
3062                 U32 fail_base;
3063                 do {
3064                     fail_state = fail[ fail_state ];
3065                     fail_base = aho->states[ fail_state ].trans.base;
3066                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3067
3068                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3069                 fail[ ch_state ] = fail_state;
3070                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3071                 {
3072                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
3073                 }
3074                 q[ q_write++ % numstates] = ch_state;
3075             }
3076         }
3077     }
3078     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3079        when we fail in state 1, this allows us to use the
3080        charclass scan to find a valid start char. This is based on the principle
3081        that theres a good chance the string being searched contains lots of stuff
3082        that cant be a start char.
3083      */
3084     fail[ 0 ] = fail[ 1 ] = 0;
3085     DEBUG_TRIE_COMPILE_r({
3086         PerlIO_printf(Perl_debug_log,
3087                       "%*sStclass Failtable (%"UVuf" states): 0",
3088                       (int)(depth * 2), "", (UV)numstates
3089         );
3090         for( q_read=1; q_read<numstates; q_read++ ) {
3091             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3092         }
3093         PerlIO_printf(Perl_debug_log, "\n");
3094     });
3095     Safefree(q);
3096     /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3097 }
3098
3099
3100 #define DEBUG_PEEP(str,scan,depth) \
3101     DEBUG_OPTIMISE_r({if (scan){ \
3102        SV * const mysv=sv_newmortal(); \
3103        regnode *Next = regnext(scan); \
3104        regprop(RExC_rx, mysv, scan, NULL); \
3105        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
3106        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
3107        Next ? (REG_NODE_NUM(Next)) : 0 ); \
3108    }});
3109
3110
3111 /* The below joins as many adjacent EXACTish nodes as possible into a single
3112  * one.  The regop may be changed if the node(s) contain certain sequences that
3113  * require special handling.  The joining is only done if:
3114  * 1) there is room in the current conglomerated node to entirely contain the
3115  *    next one.
3116  * 2) they are the exact same node type
3117  *
3118  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3119  * these get optimized out
3120  *
3121  * If a node is to match under /i (folded), the number of characters it matches
3122  * can be different than its character length if it contains a multi-character
3123  * fold.  *min_subtract is set to the total delta number of characters of the
3124  * input nodes.
3125  *
3126  * And *unfolded_multi_char is set to indicate whether or not the node contains
3127  * an unfolded multi-char fold.  This happens when whether the fold is valid or
3128  * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3129  * SMALL LETTER SHARP S, as only if the target string being matched against
3130  * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3131  * folding rules depend on the locale in force at runtime.  (Multi-char folds
3132  * whose components are all above the Latin1 range are not run-time locale
3133  * dependent, and have already been folded by the time this function is
3134  * called.)
3135  *
3136  * This is as good a place as any to discuss the design of handling these
3137  * multi-character fold sequences.  It's been wrong in Perl for a very long
3138  * time.  There are three code points in Unicode whose multi-character folds
3139  * were long ago discovered to mess things up.  The previous designs for
3140  * dealing with these involved assigning a special node for them.  This
3141  * approach doesn't always work, as evidenced by this example:
3142  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
3143  * Both sides fold to "sss", but if the pattern is parsed to create a node that
3144  * would match just the \xDF, it won't be able to handle the case where a
3145  * successful match would have to cross the node's boundary.  The new approach
3146  * that hopefully generally solves the problem generates an EXACTFU_SS node
3147  * that is "sss" in this case.
3148  *
3149  * It turns out that there are problems with all multi-character folds, and not
3150  * just these three.  Now the code is general, for all such cases.  The
3151  * approach taken is:
3152  * 1)   This routine examines each EXACTFish node that could contain multi-
3153  *      character folded sequences.  Since a single character can fold into
3154  *      such a sequence, the minimum match length for this node is less than
3155  *      the number of characters in the node.  This routine returns in
3156  *      *min_subtract how many characters to subtract from the the actual
3157  *      length of the string to get a real minimum match length; it is 0 if
3158  *      there are no multi-char foldeds.  This delta is used by the caller to
3159  *      adjust the min length of the match, and the delta between min and max,
3160  *      so that the optimizer doesn't reject these possibilities based on size
3161  *      constraints.
3162  * 2)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3163  *      is used for an EXACTFU node that contains at least one "ss" sequence in
3164  *      it.  For non-UTF-8 patterns and strings, this is the only case where
3165  *      there is a possible fold length change.  That means that a regular
3166  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
3167  *      with length changes, and so can be processed faster.  regexec.c takes
3168  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
3169  *      pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3170  *      known until runtime).  This saves effort in regex matching.  However,
3171  *      the pre-folding isn't done for non-UTF8 patterns because the fold of
3172  *      the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3173  *      forcing the pattern into UTF8 unless necessary.  Also what EXACTF (and,
3174  *      again, EXACTFL) nodes fold to isn't known until runtime.  The fold
3175  *      possibilities for the non-UTF8 patterns are quite simple, except for
3176  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
3177  *      members of a fold-pair, and arrays are set up for all of them so that
3178  *      the other member of the pair can be found quickly.  Code elsewhere in
3179  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3180  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
3181  *      described in the next item.
3182  * 3)   A problem remains for unfolded multi-char folds. (These occur when the
3183  *      validity of the fold won't be known until runtime, and so must remain
3184  *      unfolded for now.  This happens for the sharp s in EXACTF and EXACTFA
3185  *      nodes when the pattern isn't in UTF-8.  (Note, BTW, that there cannot
3186  *      be an EXACTF node with a UTF-8 pattern.)  They also occur for various
3187  *      folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3188  *      The reason this is a problem is that the optimizer part of regexec.c
3189  *      (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3190  *      that a character in the pattern corresponds to at most a single
3191  *      character in the target string.  (And I do mean character, and not byte
3192  *      here, unlike other parts of the documentation that have never been
3193  *      updated to account for multibyte Unicode.)  sharp s in EXACTF and
3194  *      EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3195  *      it can match "\x{17F}\x{17F}".  These, along with other ones in EXACTFL
3196  *      nodes, violate the assumption, and they are the only instances where it
3197  *      is violated.  I'm reluctant to try to change the assumption, as the
3198  *      code involved is impenetrable to me (khw), so instead the code here
3199  *      punts.  This routine examines EXACTFL nodes, and (when the pattern
3200  *      isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3201  *      boolean indicating whether or not the node contains such a fold.  When
3202  *      it is true, the caller sets a flag that later causes the optimizer in
3203  *      this file to not set values for the floating and fixed string lengths,
3204  *      and thus avoids the optimizer code in regexec.c that makes the invalid
3205  *      assumption.  Thus, there is no optimization based on string lengths for
3206  *      EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3207  *      EXACTF and EXACTFA nodes that contain the sharp s.  (The reason the
3208  *      assumption is wrong only in these cases is that all other non-UTF-8
3209  *      folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3210  *      their expanded versions.  (Again, we can't prefold sharp s to 'ss' in
3211  *      EXACTF nodes because we don't know at compile time if it actually
3212  *      matches 'ss' or not.  For EXACTF nodes it will match iff the target
3213  *      string is in UTF-8.  This is in contrast to EXACTFU nodes, where it
3214  *      always matches; and EXACTFA where it never does.  In an EXACTFA node in
3215  *      a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3216  *      problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3217  *      string would require the pattern to be forced into UTF-8, the overhead
3218  *      of which we want to avoid.  Similarly the unfolded multi-char folds in
3219  *      EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3220  *      locale.)
3221  *
3222  *      Similarly, the code that generates tries doesn't currently handle
3223  *      not-already-folded multi-char folds, and it looks like a pain to change
3224  *      that.  Therefore, trie generation of EXACTFA nodes with the sharp s
3225  *      doesn't work.  Instead, such an EXACTFA is turned into a new regnode,
3226  *      EXACTFA_NO_TRIE, which the trie code knows not to handle.  Most people
3227  *      using /iaa matching will be doing so almost entirely with ASCII
3228  *      strings, so this should rarely be encountered in practice */
3229
3230 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3231     if (PL_regkind[OP(scan)] == EXACT) \
3232         join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3233
3234 STATIC U32
3235 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3236                    UV *min_subtract, bool *unfolded_multi_char,
3237                    U32 flags,regnode *val, U32 depth)
3238 {
3239     /* Merge several consecutive EXACTish nodes into one. */
3240     regnode *n = regnext(scan);
3241     U32 stringok = 1;
3242     regnode *next = scan + NODE_SZ_STR(scan);
3243     U32 merged = 0;
3244     U32 stopnow = 0;
3245 #ifdef DEBUGGING
3246     regnode *stop = scan;
3247     GET_RE_DEBUG_FLAGS_DECL;
3248 #else
3249     PERL_UNUSED_ARG(depth);
3250 #endif
3251
3252     PERL_ARGS_ASSERT_JOIN_EXACT;
3253 #ifndef EXPERIMENTAL_INPLACESCAN
3254     PERL_UNUSED_ARG(flags);
3255     PERL_UNUSED_ARG(val);
3256 #endif
3257     DEBUG_PEEP("join",scan,depth);
3258
3259     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
3260      * EXACT ones that are mergeable to the current one. */
3261     while (n
3262            && (PL_regkind[OP(n)] == NOTHING
3263                || (stringok && OP(n) == OP(scan)))
3264            && NEXT_OFF(n)
3265            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3266     {
3267
3268         if (OP(n) == TAIL || n > next)
3269             stringok = 0;
3270         if (PL_regkind[OP(n)] == NOTHING) {
3271             DEBUG_PEEP("skip:",n,depth);
3272             NEXT_OFF(scan) += NEXT_OFF(n);
3273             next = n + NODE_STEP_REGNODE;
3274 #ifdef DEBUGGING
3275             if (stringok)
3276                 stop = n;
3277 #endif
3278             n = regnext(n);
3279         }
3280         else if (stringok) {
3281             const unsigned int oldl = STR_LEN(scan);
3282             regnode * const nnext = regnext(n);
3283
3284             /* XXX I (khw) kind of doubt that this works on platforms (should
3285              * Perl ever run on one) where U8_MAX is above 255 because of lots
3286              * of other assumptions */
3287             /* Don't join if the sum can't fit into a single node */
3288             if (oldl + STR_LEN(n) > U8_MAX)
3289                 break;
3290
3291             DEBUG_PEEP("merg",n,depth);
3292             merged++;
3293
3294             NEXT_OFF(scan) += NEXT_OFF(n);
3295             STR_LEN(scan) += STR_LEN(n);
3296             next = n + NODE_SZ_STR(n);
3297             /* Now we can overwrite *n : */
3298             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3299 #ifdef DEBUGGING
3300             stop = next - 1;
3301 #endif
3302             n = nnext;
3303             if (stopnow) break;
3304         }
3305
3306 #ifdef EXPERIMENTAL_INPLACESCAN
3307         if (flags && !NEXT_OFF(n)) {
3308             DEBUG_PEEP("atch", val, depth);
3309             if (reg_off_by_arg[OP(n)]) {
3310                 ARG_SET(n, val - n);
3311             }
3312             else {
3313                 NEXT_OFF(n) = val - n;
3314             }
3315             stopnow = 1;
3316         }
3317 #endif
3318     }
3319
3320     *min_subtract = 0;
3321     *unfolded_multi_char = FALSE;
3322
3323     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
3324      * can now analyze for sequences of problematic code points.  (Prior to
3325      * this final joining, sequences could have been split over boundaries, and
3326      * hence missed).  The sequences only happen in folding, hence for any
3327      * non-EXACT EXACTish node */
3328     if (OP(scan) != EXACT) {
3329         U8* s0 = (U8*) STRING(scan);
3330         U8* s = s0;
3331         U8* s_end = s0 + STR_LEN(scan);
3332
3333         int total_count_delta = 0;  /* Total delta number of characters that
3334                                        multi-char folds expand to */
3335
3336         /* One pass is made over the node's string looking for all the
3337          * possibilities.  To avoid some tests in the loop, there are two main
3338          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3339          * non-UTF-8 */
3340         if (UTF) {
3341             U8* folded = NULL;
3342
3343             if (OP(scan) == EXACTFL) {
3344                 U8 *d;
3345
3346                 /* An EXACTFL node would already have been changed to another
3347                  * node type unless there is at least one character in it that
3348                  * is problematic; likely a character whose fold definition
3349                  * won't be known until runtime, and so has yet to be folded.
3350                  * For all but the UTF-8 locale, folds are 1-1 in length, but
3351                  * to handle the UTF-8 case, we need to create a temporary
3352                  * folded copy using UTF-8 locale rules in order to analyze it.
3353                  * This is because our macros that look to see if a sequence is
3354                  * a multi-char fold assume everything is folded (otherwise the
3355                  * tests in those macros would be too complicated and slow).
3356                  * Note that here, the non-problematic folds will have already
3357                  * been done, so we can just copy such characters.  We actually
3358                  * don't completely fold the EXACTFL string.  We skip the
3359                  * unfolded multi-char folds, as that would just create work
3360                  * below to figure out the size they already are */
3361
3362                 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3363                 d = folded;
3364                 while (s < s_end) {
3365                     STRLEN s_len = UTF8SKIP(s);
3366                     if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3367                         Copy(s, d, s_len, U8);
3368                         d += s_len;
3369                     }
3370                     else if (is_FOLDS_TO_MULTI_utf8(s)) {
3371                         *unfolded_multi_char = TRUE;
3372                         Copy(s, d, s_len, U8);
3373                         d += s_len;
3374                     }
3375                     else if (isASCII(*s)) {
3376                         *(d++) = toFOLD(*s);
3377                     }
3378                     else {
3379                         STRLEN len;
3380                         _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3381                         d += len;
3382                     }
3383                     s += s_len;
3384                 }
3385
3386                 /* Point the remainder of the routine to look at our temporary
3387                  * folded copy */
3388                 s = folded;
3389                 s_end = d;
3390             } /* End of creating folded copy of EXACTFL string */
3391
3392             /* Examine the string for a multi-character fold sequence.  UTF-8
3393              * patterns have all characters pre-folded by the time this code is
3394              * executed */
3395             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3396                                      length sequence we are looking for is 2 */
3397             {
3398                 int count = 0;  /* How many characters in a multi-char fold */
3399                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3400                 if (! len) {    /* Not a multi-char fold: get next char */
3401                     s += UTF8SKIP(s);
3402                     continue;
3403                 }
3404
3405                 /* Nodes with 'ss' require special handling, except for
3406                  * EXACTFA-ish for which there is no multi-char fold to this */
3407                 if (len == 2 && *s == 's' && *(s+1) == 's'
3408                     && OP(scan) != EXACTFA
3409                     && OP(scan) != EXACTFA_NO_TRIE)
3410                 {
3411                     count = 2;
3412                     if (OP(scan) != EXACTFL) {
3413                         OP(scan) = EXACTFU_SS;
3414                     }
3415                     s += 2;
3416                 }
3417                 else { /* Here is a generic multi-char fold. */
3418                     U8* multi_end  = s + len;
3419
3420                     /* Count how many characters in it.  In the case of /aa, no
3421                      * folds which contain ASCII code points are allowed, so
3422                      * check for those, and skip if found. */
3423                     if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3424                         count = utf8_length(s, multi_end);
3425                         s = multi_end;
3426                     }
3427                     else {
3428                         while (s < multi_end) {
3429                             if (isASCII(*s)) {
3430                                 s++;
3431                                 goto next_iteration;
3432                             }
3433                             else {
3434                                 s += UTF8SKIP(s);
3435                             }
3436                             count++;
3437                         }
3438                     }
3439                 }
3440
3441                 /* The delta is how long the sequence is minus 1 (1 is how long
3442                  * the character that folds to the sequence is) */
3443                 total_count_delta += count - 1;
3444               next_iteration: ;
3445             }
3446
3447             /* We created a temporary folded copy of the string in EXACTFL
3448              * nodes.  Therefore we need to be sure it doesn't go below zero,
3449              * as the real string could be shorter */
3450             if (OP(scan) == EXACTFL) {
3451                 int total_chars = utf8_length((U8*) STRING(scan),
3452                                            (U8*) STRING(scan) + STR_LEN(scan));
3453                 if (total_count_delta > total_chars) {
3454                     total_count_delta = total_chars;
3455                 }
3456             }
3457
3458             *min_subtract += total_count_delta;
3459             Safefree(folded);
3460         }
3461         else if (OP(scan) == EXACTFA) {
3462
3463             /* Non-UTF-8 pattern, EXACTFA node.  There can't be a multi-char
3464              * fold to the ASCII range (and there are no existing ones in the
3465              * upper latin1 range).  But, as outlined in the comments preceding
3466              * this function, we need to flag any occurrences of the sharp s.
3467              * This character forbids trie formation (because of added
3468              * complexity) */
3469             while (s < s_end) {
3470                 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3471                     OP(scan) = EXACTFA_NO_TRIE;
3472                     *unfolded_multi_char = TRUE;
3473                     break;
3474                 }
3475                 s++;
3476                 continue;
3477             }
3478         }
3479         else {
3480
3481             /* Non-UTF-8 pattern, not EXACTFA node.  Look for the multi-char
3482              * folds that are all Latin1.  As explained in the comments
3483              * preceding this function, we look also for the sharp s in EXACTF
3484              * and EXACTFL nodes; it can be in the final position.  Otherwise
3485              * we can stop looking 1 byte earlier because have to find at least
3486              * two characters for a multi-fold */
3487             const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3488                               ? s_end
3489                               : s_end -1;
3490
3491             while (s < upper) {
3492                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3493                 if (! len) {    /* Not a multi-char fold. */
3494                     if (*s == LATIN_SMALL_LETTER_SHARP_S
3495                         && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
3496                     {
3497                         *unfolded_multi_char = TRUE;
3498                     }
3499                     s++;
3500                     continue;
3501                 }
3502
3503                 if (len == 2
3504                     && isARG2_lower_or_UPPER_ARG1('s', *s)
3505                     && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
3506                 {
3507
3508                     /* EXACTF nodes need to know that the minimum length
3509                      * changed so that a sharp s in the string can match this
3510                      * ss in the pattern, but they remain EXACTF nodes, as they
3511                      * won't match this unless the target string is is UTF-8,
3512                      * which we don't know until runtime.  EXACTFL nodes can't
3513                      * transform into EXACTFU nodes */
3514                     if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
3515                         OP(scan) = EXACTFU_SS;
3516                     }
3517                 }
3518
3519                 *min_subtract += len - 1;
3520                 s += len;
3521             }
3522         }
3523     }
3524
3525 #ifdef DEBUGGING
3526     /* Allow dumping but overwriting the collection of skipped
3527      * ops and/or strings with fake optimized ops */
3528     n = scan + NODE_SZ_STR(scan);
3529     while (n <= stop) {
3530         OP(n) = OPTIMIZED;
3531         FLAGS(n) = 0;
3532         NEXT_OFF(n) = 0;
3533         n++;
3534     }
3535 #endif
3536     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3537     return stopnow;
3538 }
3539
3540 /* REx optimizer.  Converts nodes into quicker variants "in place".
3541    Finds fixed substrings.  */
3542
3543 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3544    to the position after last scanned or to NULL. */
3545
3546 #define INIT_AND_WITHP \
3547     assert(!and_withp); \
3548     Newx(and_withp,1, regnode_ssc); \
3549     SAVEFREEPV(and_withp)
3550
3551 /* this is a chain of data about sub patterns we are processing that
3552    need to be handled separately/specially in study_chunk. Its so
3553    we can simulate recursion without losing state.  */
3554 struct scan_frame;
3555 typedef struct scan_frame {
3556     regnode *last;  /* last node to process in this frame */
3557     regnode *next;  /* next node to process when last is reached */
3558     struct scan_frame *prev; /*previous frame*/
3559     U32 prev_recursed_depth;
3560     I32 stop; /* what stopparen do we use */
3561 } scan_frame;
3562
3563
3564 STATIC SSize_t
3565 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3566                         SSize_t *minlenp, SSize_t *deltap,
3567                         regnode *last,
3568                         scan_data_t *data,
3569                         I32 stopparen,
3570                         U32 recursed_depth,
3571                         regnode_ssc *and_withp,
3572                         U32 flags, U32 depth)
3573                         /* scanp: Start here (read-write). */
3574                         /* deltap: Write maxlen-minlen here. */
3575                         /* last: Stop before this one. */
3576                         /* data: string data about the pattern */
3577                         /* stopparen: treat close N as END */
3578                         /* recursed: which subroutines have we recursed into */
3579                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3580 {
3581     dVAR;
3582     /* There must be at least this number of characters to match */
3583     SSize_t min = 0;
3584     I32 pars = 0, code;
3585     regnode *scan = *scanp, *next;
3586     SSize_t delta = 0;
3587     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3588     int is_inf_internal = 0;            /* The studied chunk is infinite */
3589     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3590     scan_data_t data_fake;
3591     SV *re_trie_maxbuff = NULL;
3592     regnode *first_non_open = scan;
3593     SSize_t stopmin = SSize_t_MAX;
3594     scan_frame *frame = NULL;
3595     GET_RE_DEBUG_FLAGS_DECL;
3596
3597     PERL_ARGS_ASSERT_STUDY_CHUNK;
3598
3599 #ifdef DEBUGGING
3600     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3601 #endif
3602     if ( depth == 0 ) {
3603         while (first_non_open && OP(first_non_open) == OPEN)
3604             first_non_open=regnext(first_non_open);
3605     }
3606
3607
3608   fake_study_recurse:
3609     while ( scan && OP(scan) != END && scan < last ){
3610         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3611                                    node length to get a real minimum (because
3612                                    the folded version may be shorter) */
3613         bool unfolded_multi_char = FALSE;
3614         /* Peephole optimizer: */
3615         DEBUG_OPTIMISE_MORE_r(
3616         {
3617             PerlIO_printf(Perl_debug_log,
3618                 "%*sstudy_chunk stopparen=%ld depth=%lu recursed_depth=%lu ",
3619                 ((int) depth*2), "", (long)stopparen,
3620                 (unsigned long)depth, (unsigned long)recursed_depth);
3621             if (recursed_depth) {
3622                 U32 i;
3623                 U32 j;
3624                 for ( j = 0 ; j < recursed_depth ; j++ ) {
3625                     PerlIO_printf(Perl_debug_log,"[");
3626                     for ( i = 0 ; i < (U32)RExC_npar ; i++ )
3627                         PerlIO_printf(Perl_debug_log,"%d",
3628                             PAREN_TEST(RExC_study_chunk_recursed +
3629                                        (j * RExC_study_chunk_recursed_bytes), i)
3630                             ? 1 : 0
3631                         );
3632                     PerlIO_printf(Perl_debug_log,"]");
3633                 }
3634             }
3635             PerlIO_printf(Perl_debug_log,"\n");
3636         }
3637         );
3638         DEBUG_STUDYDATA("Peep:", data, depth);
3639         DEBUG_PEEP("Peep", scan, depth);
3640
3641
3642         /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/
3643          * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled
3644          * by a different invocation of reg() -- Yves
3645          */
3646         JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
3647
3648         /* Follow the next-chain of the current node and optimize
3649            away all the NOTHINGs from it.  */
3650         if (OP(scan) != CURLYX) {
3651             const int max = (reg_off_by_arg[OP(scan)]
3652                        ? I32_MAX
3653                        /* I32 may be smaller than U16 on CRAYs! */
3654                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3655             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3656             int noff;
3657             regnode *n = scan;
3658
3659             /* Skip NOTHING and LONGJMP. */
3660             while ((n = regnext(n))
3661                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3662                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3663                    && off + noff < max)
3664                 off += noff;
3665             if (reg_off_by_arg[OP(scan)])
3666                 ARG(scan) = off;
3667             else
3668                 NEXT_OFF(scan) = off;
3669         }
3670
3671
3672
3673         /* The principal pseudo-switch.  Cannot be a switch, since we
3674            look into several different things.  */
3675         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3676                    || OP(scan) == IFTHEN) {
3677             next = regnext(scan);
3678             code = OP(scan);
3679             /* demq: the op(next)==code check is to see if we have
3680              * "branch-branch" AFAICT */
3681
3682             if (OP(next) == code || code == IFTHEN) {
3683                 /* NOTE - There is similar code to this block below for
3684                  * handling TRIE nodes on a re-study.  If you change stuff here
3685                  * check there too. */
3686                 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3687                 regnode_ssc accum;
3688                 regnode * const startbranch=scan;
3689
3690                 if (flags & SCF_DO_SUBSTR) {
3691                     /* Cannot merge strings after this. */
3692                     scan_commit(pRExC_state, data, minlenp, is_inf);
3693                 }
3694
3695                 if (flags & SCF_DO_STCLASS)
3696                     ssc_init_zero(pRExC_state, &accum);
3697
3698                 while (OP(scan) == code) {
3699                     SSize_t deltanext, minnext, fake;
3700                     I32 f = 0;
3701                     regnode_ssc this_class;
3702
3703                     num++;
3704                     data_fake.flags = 0;
3705                     if (data) {
3706                         data_fake.whilem_c = data->whilem_c;
3707                         data_fake.last_closep = data->last_closep;
3708                     }
3709                     else
3710                         data_fake.last_closep = &fake;
3711
3712                     data_fake.pos_delta = delta;
3713                     next = regnext(scan);
3714                     scan = NEXTOPER(scan);
3715                     if (code != BRANCH)
3716                         scan = NEXTOPER(scan);
3717                     if (flags & SCF_DO_STCLASS) {
3718                         ssc_init(pRExC_state, &this_class);
3719                         data_fake.start_class = &this_class;
3720                         f = SCF_DO_STCLASS_AND;
3721                     }
3722                     if (flags & SCF_WHILEM_VISITED_POS)
3723                         f |= SCF_WHILEM_VISITED_POS;
3724
3725                     /* we suppose the run is continuous, last=next...*/
3726                     minnext = study_chunk(pRExC_state, &scan, minlenp,
3727                                       &deltanext, next, &data_fake, stopparen,
3728                                       recursed_depth, NULL, f,depth+1);
3729                     if (min1 > minnext)
3730                         min1 = minnext;
3731                     if (deltanext == SSize_t_MAX) {
3732                         is_inf = is_inf_internal = 1;
3733                         max1 = SSize_t_MAX;
3734                     } else if (max1 < minnext + deltanext)
3735                         max1 = minnext + deltanext;
3736                     scan = next;
3737                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3738                         pars++;
3739                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3740                         if ( stopmin > minnext)
3741                             stopmin = min + min1;
3742                         flags &= ~SCF_DO_SUBSTR;
3743                         if (data)
3744                             data->flags |= SCF_SEEN_ACCEPT;
3745                     }
3746                     if (data) {
3747                         if (data_fake.flags & SF_HAS_EVAL)
3748                             data->flags |= SF_HAS_EVAL;
3749                         data->whilem_c = data_fake.whilem_c;
3750                     }
3751                     if (flags & SCF_DO_STCLASS)
3752                         ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
3753                 }
3754                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3755                     min1 = 0;
3756                 if (flags & SCF_DO_SUBSTR) {
3757                     data->pos_min += min1;
3758                     if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3759                         data->pos_delta = SSize_t_MAX;
3760                     else
3761                         data->pos_delta += max1 - min1;
3762                     if (max1 != min1 || is_inf)
3763                         data->longest = &(data->longest_float);
3764                 }
3765                 min += min1;
3766                 if (delta == SSize_t_MAX
3767                  || SSize_t_MAX - delta - (max1 - min1) < 0)
3768                     delta = SSize_t_MAX;
3769                 else
3770                     delta += max1 - min1;
3771                 if (flags & SCF_DO_STCLASS_OR) {
3772                     ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
3773                     if (min1) {
3774                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3775                         flags &= ~SCF_DO_STCLASS;
3776                     }
3777                 }
3778                 else if (flags & SCF_DO_STCLASS_AND) {
3779                     if (min1) {
3780                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3781                         flags &= ~SCF_DO_STCLASS;
3782                     }
3783                     else {
3784                         /* Switch to OR mode: cache the old value of
3785                          * data->start_class */
3786                         INIT_AND_WITHP;
3787                         StructCopy(data->start_class, and_withp, regnode_ssc);
3788                         flags &= ~SCF_DO_STCLASS_AND;
3789                         StructCopy(&accum, data->start_class, regnode_ssc);
3790                         flags |= SCF_DO_STCLASS_OR;
3791                     }
3792                 }
3793
3794                 if (PERL_ENABLE_TRIE_OPTIMISATION &&
3795                         OP( startbranch ) == BRANCH )
3796                 {
3797                 /* demq.
3798
3799                    Assuming this was/is a branch we are dealing with: 'scan'
3800                    now points at the item that follows the branch sequence,
3801                    whatever it is. We now start at the beginning of the
3802                    sequence and look for subsequences of
3803
3804                    BRANCH->EXACT=>x1
3805                    BRANCH->EXACT=>x2
3806                    tail
3807
3808                    which would be constructed from a pattern like
3809                    /A|LIST|OF|WORDS/
3810
3811                    If we can find such a subsequence we need to turn the first
3812                    element into a trie and then add the subsequent branch exact
3813                    strings to the trie.
3814
3815                    We have two cases
3816
3817                      1. patterns where the whole set of branches can be
3818                         converted.
3819
3820                      2. patterns where only a subset can be converted.
3821
3822                    In case 1 we can replace the whole set with a single regop
3823                    for the trie. In case 2 we need to keep the start and end
3824                    branches so
3825
3826                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3827                      becomes BRANCH TRIE; BRANCH X;
3828
3829                   There is an additional case, that being where there is a
3830                   common prefix, which gets split out into an EXACT like node
3831                   preceding the TRIE node.
3832
3833                   If x(1..n)==tail then we can do a simple trie, if not we make
3834                   a "jump" trie, such that when we match the appropriate word
3835                   we "jump" to the appropriate tail node. Essentially we turn
3836                   a nested if into a case structure of sorts.
3837
3838                 */
3839
3840                     int made=0;
3841                     if (!re_trie_maxbuff) {
3842                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3843                         if (!SvIOK(re_trie_maxbuff))
3844                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3845                     }
3846                     if ( SvIV(re_trie_maxbuff)>=0  ) {
3847                         regnode *cur;
3848                         regnode *first = (regnode *)NULL;
3849                         regnode *last = (regnode *)NULL;
3850                         regnode *tail = scan;
3851                         U8 trietype = 0;
3852                         U32 count=0;
3853
3854 #ifdef DEBUGGING
3855                         SV * const mysv = sv_newmortal();   /* for dumping */
3856 #endif
3857                         /* var tail is used because there may be a TAIL
3858                            regop in the way. Ie, the exacts will point to the
3859                            thing following the TAIL, but the last branch will
3860                            point at the TAIL. So we advance tail. If we
3861                            have nested (?:) we may have to move through several
3862                            tails.
3863                          */
3864
3865                         while ( OP( tail ) == TAIL ) {
3866                             /* this is the TAIL generated by (?:) */
3867                             tail = regnext( tail );
3868                         }
3869
3870
3871                         DEBUG_TRIE_COMPILE_r({
3872                             regprop(RExC_rx, mysv, tail, NULL);
3873                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3874                               (int)depth * 2 + 2, "",
3875                               "Looking for TRIE'able sequences. Tail node is: ",
3876                               SvPV_nolen_const( mysv )
3877                             );
3878                         });
3879
3880                         /*
3881
3882                             Step through the branches
3883                                 cur represents each branch,
3884                                 noper is the first thing to be matched as part
3885                                       of that branch
3886                                 noper_next is the regnext() of that node.
3887
3888                             We normally handle a case like this
3889                             /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
3890                             support building with NOJUMPTRIE, which restricts
3891                             the trie logic to structures like /FOO|BAR/.
3892
3893                             If noper is a trieable nodetype then the branch is
3894                             a possible optimization target. If we are building
3895                             under NOJUMPTRIE then we require that noper_next is
3896                             the same as scan (our current position in the regex
3897                             program).
3898
3899                             Once we have two or more consecutive such branches
3900                             we can create a trie of the EXACT's contents and
3901                             stitch it in place into the program.
3902
3903                             If the sequence represents all of the branches in
3904                             the alternation we replace the entire thing with a
3905                             single TRIE node.
3906
3907                             Otherwise when it is a subsequence we need to
3908                             stitch it in place and replace only the relevant
3909                             branches. This means the first branch has to remain
3910                             as it is used by the alternation logic, and its
3911                             next pointer, and needs to be repointed at the item
3912                             on the branch chain following the last branch we
3913                             have optimized away.
3914
3915                             This could be either a BRANCH, in which case the
3916                             subsequence is internal, or it could be the item
3917                             following the branch sequence in which case the
3918                             subsequence is at the end (which does not
3919                             necessarily mean the first node is the start of the
3920                             alternation).
3921
3922                             TRIE_TYPE(X) is a define which maps the optype to a
3923                             trietype.
3924
3925                                 optype          |  trietype
3926                                 ----------------+-----------
3927                                 NOTHING         | NOTHING
3928                                 EXACT           | EXACT
3929                                 EXACTFU         | EXACTFU
3930                                 EXACTFU_SS      | EXACTFU
3931                                 EXACTFA         | EXACTFA
3932
3933
3934                         */
3935 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3936                        ( EXACT == (X) )   ? EXACT :        \
3937                        ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU :        \
3938                        ( EXACTFA == (X) ) ? EXACTFA :        \
3939                        0 )
3940
3941                         /* dont use tail as the end marker for this traverse */
3942                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3943                             regnode * const noper = NEXTOPER( cur );
3944                             U8 noper_type = OP( noper );
3945                             U8 noper_trietype = TRIE_TYPE( noper_type );
3946 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3947                             regnode * const noper_next = regnext( noper );
3948                             U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3949                             U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3950 #endif
3951
3952                             DEBUG_TRIE_COMPILE_r({
3953                                 regprop(RExC_rx, mysv, cur, NULL);
3954                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3955                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3956
3957                                 regprop(RExC_rx, mysv, noper, NULL);
3958                                 PerlIO_printf( Perl_debug_log, " -> %s",
3959                                     SvPV_nolen_const(mysv));
3960
3961                                 if ( noper_next ) {
3962                                   regprop(RExC_rx, mysv, noper_next, NULL);
3963                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3964                                     SvPV_nolen_const(mysv));
3965                                 }
3966                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3967                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3968                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3969                                 );
3970                             });
3971
3972                             /* Is noper a trieable nodetype that can be merged
3973                              * with the current trie (if there is one)? */
3974                             if ( noper_trietype
3975                                   &&
3976                                   (
3977                                         ( noper_trietype == NOTHING)
3978                                         || ( trietype == NOTHING )
3979                                         || ( trietype == noper_trietype )
3980                                   )
3981 #ifdef NOJUMPTRIE
3982                                   && noper_next == tail
3983 #endif
3984                                   && count < U16_MAX)
3985                             {
3986                                 /* Handle mergable triable node Either we are
3987                                  * the first node in a new trieable sequence,
3988                                  * in which case we do some bookkeeping,
3989                                  * otherwise we update the end pointer. */
3990                                 if ( !first ) {
3991                                     first = cur;
3992                                     if ( noper_trietype == NOTHING ) {
3993 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3994                                         regnode * const noper_next = regnext( noper );
3995                                         U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3996                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3997 #endif
3998
3999                                         if ( noper_next_trietype ) {
4000                                             trietype = noper_next_trietype;
4001                                         } else if (noper_next_type)  {
4002                                             /* a NOTHING regop is 1 regop wide.
4003                                              * We need at least two for a trie
4004                                              * so we can't merge this in */
4005                                             first = NULL;
4006                                         }
4007                                     } else {
4008                                         trietype = noper_trietype;
4009                                     }
4010                                 } else {
4011                                     if ( trietype == NOTHING )
4012                                         trietype = noper_trietype;
4013                                     last = cur;
4014                                 }
4015                                 if (first)
4016                                     count++;
4017                             } /* end handle mergable triable node */
4018                             else {
4019                                 /* handle unmergable node -
4020                                  * noper may either be a triable node which can
4021                                  * not be tried together with the current trie,
4022                                  * or a non triable node */
4023                                 if ( last ) {
4024                                     /* If last is set and trietype is not
4025                                      * NOTHING then we have found at least two
4026                                      * triable branch sequences in a row of a
4027                                      * similar trietype so we can turn them
4028                                      * into a trie. If/when we allow NOTHING to
4029                                      * start a trie sequence this condition
4030                                      * will be required, and it isn't expensive
4031                                      * so we leave it in for now. */
4032                                     if ( trietype && trietype != NOTHING )
4033                                         make_trie( pRExC_state,
4034                                                 startbranch, first, cur, tail,
4035                                                 count, trietype, depth+1 );
4036                                     last = NULL; /* note: we clear/update
4037                                                     first, trietype etc below,
4038                                                     so we dont do it here */
4039                                 }
4040                                 if ( noper_trietype
4041 #ifdef NOJUMPTRIE
4042                                      && noper_next == tail
4043 #endif
4044                                 ){
4045                                     /* noper is triable, so we can start a new
4046                                      * trie sequence */
4047                                     count = 1;
4048                                     first = cur;
4049                                     trietype = noper_trietype;
4050                                 } else if (first) {
4051                                     /* if we already saw a first but the
4052                                      * current node is not triable then we have
4053                                      * to reset the first information. */
4054                                     count = 0;
4055                                     first = NULL;
4056                                     trietype = 0;
4057                                 }
4058                             } /* end handle unmergable node */
4059                         } /* loop over branches */
4060                         DEBUG_TRIE_COMPILE_r({
4061                             regprop(RExC_rx, mysv, cur, NULL);
4062                             PerlIO_printf( Perl_debug_log,
4063                               "%*s- %s (%d) <SCAN FINISHED>\n",
4064                               (int)depth * 2 + 2,
4065                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4066
4067                         });
4068                         if ( last && trietype ) {
4069                             if ( trietype != NOTHING ) {
4070                                 /* the last branch of the sequence was part of
4071                                  * a trie, so we have to construct it here
4072                                  * outside of the loop */
4073                                 made= make_trie( pRExC_state, startbranch,
4074                                                  first, scan, tail, count,
4075                                                  trietype, depth+1 );
4076 #ifdef TRIE_STUDY_OPT
4077                                 if ( ((made == MADE_EXACT_TRIE &&
4078                                      startbranch == first)
4079                                      || ( first_non_open == first )) &&
4080                                      depth==0 ) {
4081                                     flags |= SCF_TRIE_RESTUDY;
4082                                     if ( startbranch == first
4083                                          && scan == tail )
4084                                     {
4085                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4086                                     }
4087                                 }
4088 #endif
4089                             } else {
4090                                 /* at this point we know whatever we have is a
4091                                  * NOTHING sequence/branch AND if 'startbranch'
4092                                  * is 'first' then we can turn the whole thing
4093                                  * into a NOTHING
4094                                  */
4095                                 if ( startbranch == first ) {
4096                                     regnode *opt;
4097                                     /* the entire thing is a NOTHING sequence,
4098                                      * something like this: (?:|) So we can
4099                                      * turn it into a plain NOTHING op. */
4100                                     DEBUG_TRIE_COMPILE_r({
4101                                         regprop(RExC_rx, mysv, cur, NULL);
4102                                         PerlIO_printf( Perl_debug_log,
4103                                           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
4104                                           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4105
4106                                     });
4107                                     OP(startbranch)= NOTHING;
4108                                     NEXT_OFF(startbranch)= tail - startbranch;
4109                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
4110                                         OP(opt)= OPTIMIZED;
4111                                 }
4112                             }
4113                         } /* end if ( last) */
4114                     } /* TRIE_MAXBUF is non zero */
4115
4116                 } /* do trie */
4117
4118             }
4119             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
4120                 scan = NEXTOPER(NEXTOPER(scan));
4121             } else                      /* single branch is optimized. */
4122                 scan = NEXTOPER(scan);
4123             continue;
4124         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
4125             scan_frame *newframe = NULL;
4126             I32 paren;
4127             regnode *start;
4128             regnode *end;
4129             U32 my_recursed_depth= recursed_depth;
4130
4131             if (OP(scan) != SUSPEND) {
4132                 /* set the pointer */
4133                 if (OP(scan) == GOSUB) {
4134                     paren = ARG(scan);
4135                     RExC_recurse[ARG2L(scan)] = scan;
4136                     start = RExC_open_parens[paren-1];
4137                     end   = RExC_close_parens[paren-1];
4138                 } else {
4139                     paren = 0;
4140                     start = RExC_rxi->program + 1;
4141                     end   = RExC_opend;
4142                 }
4143                 if (!recursed_depth
4144                     ||
4145                     !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4146                 ) {
4147                     if (!recursed_depth) {
4148                         Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4149                     } else {
4150                         Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4151                              RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4152                              RExC_study_chunk_recursed_bytes, U8);
4153                     }
4154                     /* we havent recursed into this paren yet, so recurse into it */
4155                     DEBUG_STUDYDATA("set:", data,depth);
4156                     PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4157                     my_recursed_depth= recursed_depth + 1;
4158                     Newx(newframe,1,scan_frame);
4159                 } else {
4160                     DEBUG_STUDYDATA("inf:", data,depth);
4161                     /* some form of infinite recursion, assume infinite length
4162                      * */
4163                     if (flags & SCF_DO_SUBSTR) {
4164                         scan_commit(pRExC_state, data, minlenp, is_inf);
4165                         data->longest = &(data->longest_float);
4166                     }
4167                     is_inf = is_inf_internal = 1;
4168                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4169                         ssc_anything(data->start_class);
4170                     flags &= ~SCF_DO_STCLASS;
4171                 }
4172             } else {
4173                 Newx(newframe,1,scan_frame);
4174                 paren = stopparen;
4175                 start = scan+2;
4176                 end = regnext(scan);
4177             }
4178             if (newframe) {
4179                 assert(start);
4180                 assert(end);
4181                 SAVEFREEPV(newframe);
4182                 newframe->next = regnext(scan);
4183                 newframe->last = last;
4184                 newframe->stop = stopparen;
4185                 newframe->prev = frame;
4186                 newframe->prev_recursed_depth = recursed_depth;
4187
4188                 DEBUG_STUDYDATA("frame-new:",data,depth);
4189                 DEBUG_PEEP("fnew", scan, depth);
4190
4191                 frame = newframe;
4192                 scan =  start;
4193                 stopparen = paren;
4194                 last = end;
4195                 depth = depth + 1;
4196                 recursed_depth= my_recursed_depth;
4197
4198                 continue;
4199             }
4200         }
4201         else if (OP(scan) == EXACT) {
4202             SSize_t l = STR_LEN(scan);
4203             UV uc;
4204             if (UTF) {
4205                 const U8 * const s = (U8*)STRING(scan);
4206                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4207                 l = utf8_length(s, s + l);
4208             } else {
4209                 uc = *((U8*)STRING(scan));
4210             }
4211             min += l;
4212             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4213                 /* The code below prefers earlier match for fixed
4214                    offset, later match for variable offset.  */
4215                 if (data->last_end == -1) { /* Update the start info. */
4216                     data->last_start_min = data->pos_min;
4217                     data->last_start_max = is_inf
4218                         ? SSize_t_MAX : data->pos_min + data->pos_delta;
4219                 }
4220                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4221                 if (UTF)
4222                     SvUTF8_on(data->last_found);
4223                 {
4224                     SV * const sv = data->last_found;
4225                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4226                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4227                     if (mg && mg->mg_len >= 0)
4228                         mg->mg_len += utf8_length((U8*)STRING(scan),
4229                                               (U8*)STRING(scan)+STR_LEN(scan));
4230                 }
4231                 data->last_end = data->pos_min + l;
4232                 data->pos_min += l; /* As in the first entry. */
4233                 data->flags &= ~SF_BEFORE_EOL;
4234             }
4235
4236             /* ANDing the code point leaves at most it, and not in locale, and
4237              * can't match null string */
4238             if (flags & SCF_DO_STCLASS_AND) {
4239                 ssc_cp_and(data->start_class, uc);
4240                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4241                 ssc_clear_locale(data->start_class);
4242             }
4243             else if (flags & SCF_DO_STCLASS_OR) {
4244                 ssc_add_cp(data->start_class, uc);
4245                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4246
4247                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4248                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4249             }
4250             flags &= ~SCF_DO_STCLASS;
4251         }
4252         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
4253             SSize_t l = STR_LEN(scan);
4254             UV uc = *((U8*)STRING(scan));
4255             SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
4256                                                      separate code points */
4257
4258             /* Search for fixed substrings supports EXACT only. */
4259             if (flags & SCF_DO_SUBSTR) {
4260                 assert(data);
4261                 scan_commit(pRExC_state, data, minlenp, is_inf);
4262             }
4263             if (UTF) {
4264                 const U8 * const s = (U8 *)STRING(scan);
4265                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4266                 l = utf8_length(s, s + l);
4267             }
4268             if (unfolded_multi_char) {
4269                 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4270             }
4271             min += l - min_subtract;
4272             assert (min >= 0);
4273             delta += min_subtract;
4274             if (flags & SCF_DO_SUBSTR) {
4275                 data->pos_min += l - min_subtract;
4276                 if (data->pos_min < 0) {
4277                     data->pos_min = 0;
4278                 }
4279                 data->pos_delta += min_subtract;
4280                 if (min_subtract) {
4281                     data->longest = &(data->longest_float);
4282                 }
4283             }
4284             if (OP(scan) == EXACTFL) {
4285
4286                 /* We don't know what the folds are; it could be anything. XXX
4287                  * Actually, we only support UTF-8 encoding for code points
4288                  * above Latin1, so we could know what those folds are. */
4289                 EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
4290                                                        0,
4291                                                        UV_MAX);
4292             }
4293             else {  /* Non-locale EXACTFish */
4294                 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4295                 if (flags & SCF_DO_STCLASS_AND) {
4296                     ssc_clear_locale(data->start_class);
4297                 }
4298                 if (uc < 256) { /* We know what the Latin1 folds are ... */
4299                     if (IS_IN_SOME_FOLD_L1(uc)) {   /* For instance, we
4300                                                        know if anything folds
4301                                                        with this */
4302                         EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
4303                                                            PL_fold_latin1[uc]);
4304                         if (OP(scan) != EXACTFA) { /* The folds below aren't
4305                                                       legal under /iaa */
4306                             if (isARG2_lower_or_UPPER_ARG1('s', uc)) {
4307                                 EXACTF_invlist
4308                                     = add_cp_to_invlist(EXACTF_invlist,
4309                                                 LATIN_SMALL_LETTER_SHARP_S);
4310                             }
4311                             else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
4312                                 EXACTF_invlist
4313                                     = add_cp_to_invlist(EXACTF_invlist, 's');
4314                                 EXACTF_invlist
4315                                     = add_cp_to_invlist(EXACTF_invlist, 'S');
4316                             }
4317                         }
4318
4319                         /* We also know if there are above-Latin1 code points
4320                          * that fold to this (none legal for ASCII and /iaa) */
4321                         if ((! isASCII(uc) || OP(scan) != EXACTFA)
4322                             && HAS_NONLATIN1_FOLD_CLOSURE(uc))
4323                         {
4324                             /* XXX We could know exactly what does fold to this
4325                              * if the reverse folds are loaded, as currently in
4326                              * S_regclass() */
4327                             _invlist_union(EXACTF_invlist,
4328                                            PL_AboveLatin1,
4329                                            &EXACTF_invlist);
4330                         }
4331                     }
4332                 }
4333                 else {  /* Non-locale, above Latin1.  XXX We don't currently
4334                            know what participates in folds with this, so have
4335                            to assume anything could */
4336
4337                     /* XXX We could know exactly what does fold to this if the
4338                      * reverse folds are loaded, as currently in S_regclass().
4339                      * But we do know that under /iaa nothing in the ASCII
4340                      * range can participate */
4341                     if (OP(scan) == EXACTFA) {
4342                         _invlist_union_complement_2nd(EXACTF_invlist,
4343                                                       PL_XPosix_ptrs[_CC_ASCII],
4344                                                       &EXACTF_invlist);
4345                     }
4346                     else {
4347                         EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
4348                                                                0, UV_MAX);
4349                     }
4350                 }
4351             }
4352             if (flags & SCF_DO_STCLASS_AND) {
4353                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4354                 ANYOF_POSIXL_ZERO(data->start_class);
4355                 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4356             }
4357             else if (flags & SCF_DO_STCLASS_OR) {
4358                 ssc_union(data->start_class, EXACTF_invlist, FALSE);
4359                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4360
4361                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4362                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4363             }
4364             flags &= ~SCF_DO_STCLASS;
4365             SvREFCNT_dec(EXACTF_invlist);
4366         }
4367         else if (REGNODE_VARIES(OP(scan))) {
4368             SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4369             I32 fl = 0, f = flags;
4370             regnode * const oscan = scan;
4371             regnode_ssc this_class;
4372             regnode_ssc *oclass = NULL;
4373             I32 next_is_eval = 0;
4374
4375             switch (PL_regkind[OP(scan)]) {
4376             case WHILEM:                /* End of (?:...)* . */
4377                 scan = NEXTOPER(scan);
4378                 goto finish;
4379             case PLUS:
4380                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4381                     next = NEXTOPER(scan);
4382                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
4383                         mincount = 1;
4384                         maxcount = REG_INFTY;
4385                         next = regnext(scan);
4386                         scan = NEXTOPER(scan);
4387                         goto do_curly;
4388                     }
4389                 }
4390                 if (flags & SCF_DO_SUBSTR)
4391                     data->pos_min++;
4392                 min++;
4393                 /* FALLTHROUGH */
4394             case STAR:
4395                 if (flags & SCF_DO_STCLASS) {
4396                     mincount = 0;
4397                     maxcount = REG_INFTY;
4398                     next = regnext(scan);
4399                     scan = NEXTOPER(scan);
4400                     goto do_curly;
4401                 }
4402                 if (flags & SCF_DO_SUBSTR) {
4403                     scan_commit(pRExC_state, data, minlenp, is_inf);
4404                     /* Cannot extend fixed substrings */
4405                     data->longest = &(data->longest_float);
4406                 }
4407                 is_inf = is_inf_internal = 1;
4408                 scan = regnext(scan);
4409                 goto optimize_curly_tail;
4410             case CURLY:
4411                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4412                     && (scan->flags == stopparen))
4413                 {
4414                     mincount = 1;
4415                     maxcount = 1;
4416                 } else {
4417                     mincount = ARG1(scan);
4418                     maxcount = ARG2(scan);
4419                 }
4420                 next = regnext(scan);
4421                 if (OP(scan) == CURLYX) {
4422                     I32 lp = (data ? *(data->last_closep) : 0);
4423                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4424                 }
4425                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4426                 next_is_eval = (OP(scan) == EVAL);
4427               do_curly:
4428                 if (flags & SCF_DO_SUBSTR) {
4429                     if (mincount == 0)
4430                         scan_commit(pRExC_state, data, minlenp, is_inf);
4431                     /* Cannot extend fixed substrings */
4432                     pos_before = data->pos_min;
4433                 }
4434                 if (data) {
4435                     fl = data->flags;
4436                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4437                     if (is_inf)
4438                         data->flags |= SF_IS_INF;
4439                 }
4440                 if (flags & SCF_DO_STCLASS) {
4441                     ssc_init(pRExC_state, &this_class);
4442                     oclass = data->start_class;
4443                     data->start_class = &this_class;
4444                     f |= SCF_DO_STCLASS_AND;
4445                     f &= ~SCF_DO_STCLASS_OR;
4446                 }
4447                 /* Exclude from super-linear cache processing any {n,m}
4448                    regops for which the combination of input pos and regex
4449                    pos is not enough information to determine if a match
4450                    will be possible.
4451
4452                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
4453                    regex pos at the \s*, the prospects for a match depend not
4454                    only on the input position but also on how many (bar\s*)
4455                    repeats into the {4,8} we are. */
4456                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4457                     f &= ~SCF_WHILEM_VISITED_POS;
4458
4459                 /* This will finish on WHILEM, setting scan, or on NULL: */
4460                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4461                                   last, data, stopparen, recursed_depth, NULL,
4462                                   (mincount == 0
4463                                    ? (f & ~SCF_DO_SUBSTR)
4464                                    : f)
4465                                   ,depth+1);
4466
4467                 if (flags & SCF_DO_STCLASS)
4468                     data->start_class = oclass;
4469                 if (mincount == 0 || minnext == 0) {
4470                     if (flags & SCF_DO_STCLASS_OR) {
4471                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4472                     }
4473                     else if (flags & SCF_DO_STCLASS_AND) {
4474                         /* Switch to OR mode: cache the old value of
4475                          * data->start_class */
4476                         INIT_AND_WITHP;
4477                         StructCopy(data->start_class, and_withp, regnode_ssc);
4478                         flags &= ~SCF_DO_STCLASS_AND;
4479                         StructCopy(&this_class, data->start_class, regnode_ssc);
4480                         flags |= SCF_DO_STCLASS_OR;
4481                         ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
4482                     }
4483                 } else {                /* Non-zero len */
4484                     if (flags & SCF_DO_STCLASS_OR) {
4485                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4486                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4487                     }
4488                     else if (flags & SCF_DO_STCLASS_AND)
4489                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4490                     flags &= ~SCF_DO_STCLASS;
4491                 }
4492                 if (!scan)              /* It was not CURLYX, but CURLY. */
4493                     scan = next;
4494                 if (!(flags & SCF_TRIE_DOING_RESTUDY)
4495                     /* ? quantifier ok, except for (?{ ... }) */
4496                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
4497                     && (minnext == 0) && (deltanext == 0)
4498                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4499                     && maxcount <= REG_INFTY/3) /* Complement check for big
4500                                                    count */
4501                 {
4502                     /* Fatal warnings may leak the regexp without this: */
4503                     SAVEFREESV(RExC_rx_sv);
4504                     ckWARNreg(RExC_parse,
4505                             "Quantifier unexpected on zero-length expression");
4506                     (void)ReREFCNT_inc(RExC_rx_sv);
4507                 }
4508
4509                 min += minnext * mincount;
4510                 is_inf_internal |= deltanext == SSize_t_MAX
4511                          || (maxcount == REG_INFTY && minnext + deltanext > 0);
4512                 is_inf |= is_inf_internal;
4513                 if (is_inf) {
4514                     delta = SSize_t_MAX;
4515                 } else {
4516                     delta += (minnext + deltanext) * maxcount
4517                              - minnext * mincount;
4518                 }
4519                 /* Try powerful optimization CURLYX => CURLYN. */
4520                 if (  OP(oscan) == CURLYX && data
4521                       && data->flags & SF_IN_PAR
4522                       && !(data->flags & SF_HAS_EVAL)
4523                       && !deltanext && minnext == 1 ) {
4524                     /* Try to optimize to CURLYN.  */
4525                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4526                     regnode * const nxt1 = nxt;
4527 #ifdef DEBUGGING
4528                     regnode *nxt2;
4529 #endif
4530
4531                     /* Skip open. */
4532                     nxt = regnext(nxt);
4533                     if (!REGNODE_SIMPLE(OP(nxt))
4534                         && !(PL_regkind[OP(nxt)] == EXACT
4535                              && STR_LEN(nxt) == 1))
4536                         goto nogo;
4537 #ifdef DEBUGGING
4538                     nxt2 = nxt;
4539 #endif
4540                     nxt = regnext(nxt);
4541                     if (OP(nxt) != CLOSE)
4542                         goto nogo;
4543                     if (RExC_open_parens) {
4544                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4545                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4546                     }
4547                     /* Now we know that nxt2 is the only contents: */
4548                     oscan->flags = (U8)ARG(nxt);
4549                     OP(oscan) = CURLYN;
4550                     OP(nxt1) = NOTHING; /* was OPEN. */
4551
4552 #ifdef DEBUGGING
4553                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4554                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4555                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4556                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
4557                     OP(nxt + 1) = OPTIMIZED; /* was count. */
4558                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4559 #endif
4560                 }
4561               nogo:
4562
4563                 /* Try optimization CURLYX => CURLYM. */
4564                 if (  OP(oscan) == CURLYX && data
4565                       && !(data->flags & SF_HAS_PAR)
4566                       && !(data->flags & SF_HAS_EVAL)
4567                       && !deltanext     /* atom is fixed width */
4568                       && minnext != 0   /* CURLYM can't handle zero width */
4569
4570                          /* Nor characters whose fold at run-time may be
4571                           * multi-character */
4572                       && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
4573                 ) {
4574                     /* XXXX How to optimize if data == 0? */
4575                     /* Optimize to a simpler form.  */
4576                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4577                     regnode *nxt2;
4578
4579                     OP(oscan) = CURLYM;
4580                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4581                             && (OP(nxt2) != WHILEM))
4582                         nxt = nxt2;
4583                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
4584                     /* Need to optimize away parenths. */
4585                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4586                         /* Set the parenth number.  */
4587                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4588
4589                         oscan->flags = (U8)ARG(nxt);
4590                         if (RExC_open_parens) {
4591                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4592                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4593                         }
4594                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
4595                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
4596
4597 #ifdef DEBUGGING
4598                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4599                         OP(nxt + 1) = OPTIMIZED; /* was count. */
4600                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4601                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4602 #endif
4603 #if 0
4604                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
4605                             regnode *nnxt = regnext(nxt1);
4606                             if (nnxt == nxt) {
4607                                 if (reg_off_by_arg[OP(nxt1)])
4608                                     ARG_SET(nxt1, nxt2 - nxt1);
4609                                 else if (nxt2 - nxt1 < U16_MAX)
4610                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
4611                                 else
4612                                     OP(nxt) = NOTHING;  /* Cannot beautify */
4613                             }
4614                             nxt1 = nnxt;
4615                         }
4616 #endif
4617                         /* Optimize again: */
4618                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4619                                     NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4620                     }
4621                     else
4622                         oscan->flags = 0;
4623                 }
4624                 else if ((OP(oscan) == CURLYX)
4625                          && (flags & SCF_WHILEM_VISITED_POS)
4626                          /* See the comment on a similar expression above.
4627                             However, this time it's not a subexpression
4628                             we care about, but the expression itself. */
4629                          && (maxcount == REG_INFTY)
4630                          && data && ++data->whilem_c < 16) {
4631                     /* This stays as CURLYX, we can put the count/of pair. */
4632                     /* Find WHILEM (as in regexec.c) */
4633                     regnode *nxt = oscan + NEXT_OFF(oscan);
4634
4635                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4636                         nxt += ARG(nxt);
4637                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
4638                         | (RExC_whilem_seen << 4)); /* On WHILEM */
4639                 }
4640                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4641                     pars++;
4642                 if (flags & SCF_DO_SUBSTR) {
4643                     SV *last_str = NULL;
4644                     STRLEN last_chrs = 0;
4645                     int counted = mincount != 0;
4646
4647                     if (data->last_end > 0 && mincount != 0) { /* Ends with a
4648                                                                   string. */
4649                         SSize_t b = pos_before >= data->last_start_min
4650                             ? pos_before : data->last_start_min;
4651                         STRLEN l;
4652                         const char * const s = SvPV_const(data->last_found, l);
4653                         SSize_t old = b - data->last_start_min;
4654
4655                         if (UTF)
4656                             old = utf8_hop((U8*)s, old) - (U8*)s;
4657                         l -= old;
4658                         /* Get the added string: */
4659                         last_str = newSVpvn_utf8(s  + old, l, UTF);
4660                         last_chrs = UTF ? utf8_length((U8*)(s + old),
4661                                             (U8*)(s + old + l)) : l;
4662                         if (deltanext == 0 && pos_before == b) {
4663                             /* What was added is a constant string */
4664                             if (mincount > 1) {
4665
4666                                 SvGROW(last_str, (mincount * l) + 1);
4667                                 repeatcpy(SvPVX(last_str) + l,
4668                                           SvPVX_const(last_str), l,
4669                                           mincount - 1);
4670                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4671                                 /* Add additional parts. */
4672                                 SvCUR_set(data->last_found,
4673                                           SvCUR(data->last_found) - l);
4674                                 sv_catsv(data->last_found, last_str);
4675                                 {
4676                                     SV * sv = data->last_found;
4677                                     MAGIC *mg =
4678                                         SvUTF8(sv) && SvMAGICAL(sv) ?
4679                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4680                                     if (mg && mg->mg_len >= 0)
4681                                         mg->mg_len += last_chrs * (mincount-1);
4682                                 }
4683                                 last_chrs *= mincount;
4684                                 data->last_end += l * (mincount - 1);
4685                             }
4686                         } else {
4687                             /* start offset must point into the last copy */
4688                             data->last_start_min += minnext * (mincount - 1);
4689                             data->last_start_max += is_inf ? SSize_t_MAX
4690                                 : (maxcount - 1) * (minnext + data->pos_delta);
4691                         }
4692                     }
4693                     /* It is counted once already... */
4694                     data->pos_min += minnext * (mincount - counted);
4695 #if 0
4696 PerlIO_printf(Perl_debug_log, "counted=%"UVdf" deltanext=%"UVdf
4697                               " SSize_t_MAX=%"UVdf" minnext=%"UVdf
4698                               " maxcount=%"UVdf" mincount=%"UVdf"\n",
4699     (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4700     (UV)mincount);
4701 if (deltanext != SSize_t_MAX)
4702 PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
4703     (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4704           - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4705 #endif
4706                     if (deltanext == SSize_t_MAX
4707                         || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4708                         data->pos_delta = SSize_t_MAX;
4709                     else
4710                         data->pos_delta += - counted * deltanext +
4711                         (minnext + deltanext) * maxcount - minnext * mincount;
4712                     if (mincount != maxcount) {
4713                          /* Cannot extend fixed substrings found inside
4714                             the group.  */
4715                         scan_commit(pRExC_state, data, minlenp, is_inf);
4716                         if (mincount && last_str) {
4717                             SV * const sv = data->last_found;
4718                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4719                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4720
4721                             if (mg)
4722                                 mg->mg_len = -1;
4723                             sv_setsv(sv, last_str);
4724                             data->last_end = data->pos_min;
4725                             data->last_start_min = data->pos_min - last_chrs;
4726                             data->last_start_max = is_inf
4727                                 ? SSize_t_MAX
4728                                 : data->pos_min + data->pos_delta - last_chrs;
4729                         }
4730                         data->longest = &(data->longest_float);
4731                     }
4732                     SvREFCNT_dec(last_str);
4733                 }
4734                 if (data && (fl & SF_HAS_EVAL))
4735                     data->flags |= SF_HAS_EVAL;
4736               optimize_curly_tail:
4737                 if (OP(oscan) != CURLYX) {
4738                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4739                            && NEXT_OFF(next))
4740                         NEXT_OFF(oscan) += NEXT_OFF(next);
4741                 }
4742                 continue;
4743
4744             default:
4745 #ifdef DEBUGGING
4746                 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
4747                                                                     OP(scan));
4748 #endif
4749             case REF:
4750             case CLUMP:
4751                 if (flags & SCF_DO_SUBSTR) {
4752                     /* Cannot expect anything... */
4753                     scan_commit(pRExC_state, data, minlenp, is_inf);
4754                     data->longest = &(data->longest_float);
4755                 }
4756                 is_inf = is_inf_internal = 1;
4757                 if (flags & SCF_DO_STCLASS_OR) {
4758                     if (OP(scan) == CLUMP) {
4759                         /* Actually is any start char, but very few code points
4760                          * aren't start characters */
4761                         ssc_match_all_cp(data->start_class);
4762                     }
4763                     else {
4764                         ssc_anything(data->start_class);
4765                     }
4766                 }
4767                 flags &= ~SCF_DO_STCLASS;
4768                 break;
4769             }
4770         }
4771         else if (OP(scan) == LNBREAK) {
4772             if (flags & SCF_DO_STCLASS) {
4773                 if (flags & SCF_DO_STCLASS_AND) {
4774                     ssc_intersection(data->start_class,
4775                                     PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
4776                     ssc_clear_locale(data->start_class);
4777                     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4778                 }
4779                 else if (flags & SCF_DO_STCLASS_OR) {
4780                     ssc_union(data->start_class,
4781                               PL_XPosix_ptrs[_CC_VERTSPACE],
4782                               FALSE);
4783                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4784
4785                     /* See commit msg for
4786                      * 749e076fceedeb708a624933726e7989f2302f6a */
4787                     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4788                 }
4789                 flags &= ~SCF_DO_STCLASS;
4790             }
4791             min++;
4792             delta++;    /* Because of the 2 char string cr-lf */
4793             if (flags & SCF_DO_SUBSTR) {
4794                 /* Cannot expect anything... */
4795                 scan_commit(pRExC_state, data, minlenp, is_inf);
4796                 data->pos_min += 1;
4797                 data->pos_delta += 1;
4798                 data->longest = &(data->longest_float);
4799             }
4800         }
4801         else if (REGNODE_SIMPLE(OP(scan))) {
4802
4803             if (flags & SCF_DO_SUBSTR) {
4804                 scan_commit(pRExC_state, data, minlenp, is_inf);
4805                 data->pos_min++;
4806             }
4807             min++;
4808             if (flags & SCF_DO_STCLASS) {
4809                 bool invert = 0;
4810                 SV* my_invlist = sv_2mortal(_new_invlist(0));
4811                 U8 namedclass;
4812
4813                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4814                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4815
4816                 /* Some of the logic below assumes that switching
4817                    locale on will only add false positives. */
4818                 switch (OP(scan)) {
4819
4820                 default:
4821 #ifdef DEBUGGING
4822                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
4823                                                                      OP(scan));
4824 #endif
4825                 case CANY:
4826                 case SANY:
4827                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4828                         ssc_match_all_cp(data->start_class);
4829                     break;
4830
4831                 case REG_ANY:
4832                     {
4833                         SV* REG_ANY_invlist = _new_invlist(2);
4834                         REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
4835                                                             '\n');
4836                         if (flags & SCF_DO_STCLASS_OR) {
4837                             ssc_union(data->start_class,
4838                                       REG_ANY_invlist,
4839                                       TRUE /* TRUE => invert, hence all but \n
4840                                             */
4841                                       );
4842                         }
4843                         else if (flags & SCF_DO_STCLASS_AND) {
4844                             ssc_intersection(data->start_class,
4845                                              REG_ANY_invlist,
4846                                              TRUE  /* TRUE => invert */
4847                                              );
4848                             ssc_clear_locale(data->start_class);
4849                         }
4850                         SvREFCNT_dec_NN(REG_ANY_invlist);
4851                     }
4852                     break;
4853
4854                 case ANYOF:
4855                     if (flags & SCF_DO_STCLASS_AND)
4856                         ssc_and(pRExC_state, data->start_class,
4857                                 (regnode_charclass *) scan);
4858                     else
4859                         ssc_or(pRExC_state, data->start_class,
4860                                                           (regnode_charclass *) scan);
4861                     break;
4862
4863                 case NPOSIXL:
4864                     invert = 1;
4865                     /* FALLTHROUGH */
4866
4867                 case POSIXL:
4868                     namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
4869                     if (flags & SCF_DO_STCLASS_AND) {
4870                         bool was_there = cBOOL(
4871                                           ANYOF_POSIXL_TEST(data->start_class,
4872                                                                  namedclass));
4873                         ANYOF_POSIXL_ZERO(data->start_class);
4874                         if (was_there) {    /* Do an AND */
4875                             ANYOF_POSIXL_SET(data->start_class, namedclass);
4876                         }
4877                         /* No individual code points can now match */
4878                         data->start_class->invlist
4879                                                 = sv_2mortal(_new_invlist(0));
4880                     }
4881                     else {
4882                         int complement = namedclass + ((invert) ? -1 : 1);
4883
4884                         assert(flags & SCF_DO_STCLASS_OR);
4885
4886                         /* If the complement of this class was already there,
4887                          * the result is that they match all code points,
4888                          * (\d + \D == everything).  Remove the classes from
4889                          * future consideration.  Locale is not relevant in
4890                          * this case */
4891                         if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
4892                             ssc_match_all_cp(data->start_class);
4893                             ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
4894                             ANYOF_POSIXL_CLEAR(data->start_class, complement);
4895                         }
4896                         else {  /* The usual case; just add this class to the
4897                                    existing set */
4898                             ANYOF_POSIXL_SET(data->start_class, namedclass);
4899                         }
4900                     }
4901                     break;
4902
4903                 case NPOSIXA:   /* For these, we always know the exact set of
4904                                    what's matched */
4905                     invert = 1;
4906                     /* FALLTHROUGH */
4907                 case POSIXA:
4908                     if (FLAGS(scan) == _CC_ASCII) {
4909                         my_invlist = PL_XPosix_ptrs[_CC_ASCII];
4910                     }
4911                     else {
4912                         _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
4913                                               PL_XPosix_ptrs[_CC_ASCII],
4914                                               &my_invlist);
4915                     }
4916                     goto join_posix;
4917
4918                 case NPOSIXD:
4919                 case NPOSIXU:
4920                     invert = 1;
4921                     /* FALLTHROUGH */
4922                 case POSIXD:
4923                 case POSIXU:
4924                     my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
4925
4926                     /* NPOSIXD matches all upper Latin1 code points unless the
4927                      * target string being matched is UTF-8, which is
4928                      * unknowable until match time.  Since we are going to
4929                      * invert, we want to get rid of all of them so that the
4930                      * inversion will match all */
4931                     if (OP(scan) == NPOSIXD) {
4932                         _invlist_subtract(my_invlist, PL_UpperLatin1,
4933                                           &my_invlist);
4934                     }
4935
4936                   join_posix:
4937
4938                     if (flags & SCF_DO_STCLASS_AND) {
4939                         ssc_intersection(data->start_class, my_invlist, invert);
4940                         ssc_clear_locale(data->start_class);
4941                     }
4942                     else {
4943                         assert(flags & SCF_DO_STCLASS_OR);
4944                         ssc_union(data->start_class, my_invlist, invert);
4945                     }
4946                 }
4947                 if (flags & SCF_DO_STCLASS_OR)
4948                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4949                 flags &= ~SCF_DO_STCLASS;
4950             }
4951         }
4952         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4953             data->flags |= (OP(scan) == MEOL
4954                             ? SF_BEFORE_MEOL
4955                             : SF_BEFORE_SEOL);
4956             scan_commit(pRExC_state, data, minlenp, is_inf);
4957
4958         }
4959         else if (  PL_regkind[OP(scan)] == BRANCHJ
4960                  /* Lookbehind, or need to calculate parens/evals/stclass: */
4961                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
4962                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4963             if ( OP(scan) == UNLESSM &&
4964                  scan->flags == 0 &&
4965                  OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4966                  OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4967             ) {
4968                 regnode *opt;
4969                 regnode *upto= regnext(scan);
4970                 DEBUG_PARSE_r({
4971                     SV * const mysv_val=sv_newmortal();
4972                     DEBUG_STUDYDATA("OPFAIL",data,depth);
4973
4974                     /*DEBUG_PARSE_MSG("opfail");*/
4975                     regprop(RExC_rx, mysv_val, upto, NULL);
4976                     PerlIO_printf(Perl_debug_log,
4977                         "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4978                         SvPV_nolen_const(mysv_val),
4979                         (IV)REG_NODE_NUM(upto),
4980                         (IV)(upto - scan)
4981                     );
4982                 });
4983                 OP(scan) = OPFAIL;
4984                 NEXT_OFF(scan) = upto - scan;
4985                 for (opt= scan + 1; opt < upto ; opt++)
4986                     OP(opt) = OPTIMIZED;
4987                 scan= upto;
4988                 continue;
4989             }
4990             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4991                 || OP(scan) == UNLESSM )
4992             {
4993                 /* Negative Lookahead/lookbehind
4994                    In this case we can't do fixed string optimisation.
4995                 */
4996
4997                 SSize_t deltanext, minnext, fake = 0;
4998                 regnode *nscan;
4999                 regnode_ssc intrnl;
5000                 int f = 0;
5001
5002                 data_fake.flags = 0;
5003                 if (data) {
5004                     data_fake.whilem_c = data->whilem_c;
5005                     data_fake.last_closep = data->last_closep;
5006                 }
5007                 else
5008                     data_fake.last_closep = &fake;
5009                 data_fake.pos_delta = delta;
5010                 if ( flags & SCF_DO_STCLASS && !scan->flags
5011                      && OP(scan) == IFMATCH ) { /* Lookahead */
5012                     ssc_init(pRExC_state, &intrnl);
5013                     data_fake.start_class = &intrnl;
5014                     f |= SCF_DO_STCLASS_AND;
5015                 }
5016                 if (flags & SCF_WHILEM_VISITED_POS)
5017                     f |= SCF_WHILEM_VISITED_POS;
5018                 next = regnext(scan);
5019                 nscan = NEXTOPER(NEXTOPER(scan));
5020                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5021                                       last, &data_fake, stopparen,
5022                                       recursed_depth, NULL, f, depth+1);
5023                 if (scan->flags) {
5024                     if (deltanext) {
5025                         FAIL("Variable length lookbehind not implemented");
5026                     }
5027                     else if (minnext > (I32)U8_MAX) {
5028                         FAIL2("Lookbehind longer than %"UVuf" not implemented",
5029                               (UV)U8_MAX);
5030                     }
5031                     scan->flags = (U8)minnext;
5032                 }
5033                 if (data) {
5034                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5035                         pars++;
5036                     if (data_fake.flags & SF_HAS_EVAL)
5037                         data->flags |= SF_HAS_EVAL;
5038                     data->whilem_c = data_fake.whilem_c;
5039                 }
5040                 if (f & SCF_DO_STCLASS_AND) {
5041                     if (flags & SCF_DO_STCLASS_OR) {
5042                         /* OR before, AND after: ideally we would recurse with
5043                          * data_fake to get the AND applied by study of the
5044                          * remainder of the pattern, and then derecurse;
5045                          * *** HACK *** for now just treat as "no information".
5046                          * See [perl #56690].
5047                          */
5048                         ssc_init(pRExC_state, data->start_class);
5049                     }  else {
5050                         /* AND before and after: combine and continue */
5051                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5052                     }
5053                 }
5054             }
5055 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5056             else {
5057                 /* Positive Lookahead/lookbehind
5058                    In this case we can do fixed string optimisation,
5059                    but we must be careful about it. Note in the case of
5060                    lookbehind the positions will be offset by the minimum
5061                    length of the pattern, something we won't know about
5062                    until after the recurse.
5063                 */
5064                 SSize_t deltanext, fake = 0;
5065                 regnode *nscan;
5066                 regnode_ssc intrnl;
5067                 int f = 0;
5068                 /* We use SAVEFREEPV so that when the full compile
5069                     is finished perl will clean up the allocated
5070                     minlens when it's all done. This way we don't
5071                     have to worry about freeing them when we know
5072                     they wont be used, which would be a pain.
5073                  */
5074                 SSize_t *minnextp;
5075                 Newx( minnextp, 1, SSize_t );
5076                 SAVEFREEPV(minnextp);
5077
5078                 if (data) {
5079                     StructCopy(data, &data_fake, scan_data_t);
5080                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5081                         f |= SCF_DO_SUBSTR;
5082                         if (scan->flags)
5083                             scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5084                         data_fake.last_found=newSVsv(data->last_found);
5085                     }
5086                 }
5087                 else
5088                     data_fake.last_closep = &fake;
5089                 data_fake.flags = 0;
5090                 data_fake.pos_delta = delta;
5091                 if (is_inf)
5092                     data_fake.flags |= SF_IS_INF;
5093                 if ( flags & SCF_DO_STCLASS && !scan->flags
5094                      && OP(scan) == IFMATCH ) { /* Lookahead */
5095                     ssc_init(pRExC_state, &intrnl);
5096                     data_fake.start_class = &intrnl;
5097                     f |= SCF_DO_STCLASS_AND;
5098                 }
5099                 if (flags & SCF_WHILEM_VISITED_POS)
5100                     f |= SCF_WHILEM_VISITED_POS;
5101                 next = regnext(scan);
5102                 nscan = NEXTOPER(NEXTOPER(scan));
5103
5104                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5105                                         &deltanext, last, &data_fake,
5106                                         stopparen, recursed_depth, NULL,
5107                                         f,depth+1);
5108                 if (scan->flags) {
5109                     if (deltanext) {
5110                         FAIL("Variable length lookbehind not implemented");
5111                     }
5112                     else if (*minnextp > (I32)U8_MAX) {
5113                         FAIL2("Lookbehind longer than %"UVuf" not implemented",
5114                               (UV)U8_MAX);
5115                     }
5116                     scan->flags = (U8)*minnextp;
5117                 }
5118
5119                 *minnextp += min;
5120
5121                 if (f & SCF_DO_STCLASS_AND) {
5122                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5123                 }
5124                 if (data) {
5125                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5126                         pars++;
5127                     if (data_fake.flags & SF_HAS_EVAL)
5128                         data->flags |= SF_HAS_EVAL;
5129                     data->whilem_c = data_fake.whilem_c;
5130                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5131                         if (RExC_rx->minlen<*minnextp)
5132                             RExC_rx->minlen=*minnextp;
5133                         scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5134                         SvREFCNT_dec_NN(data_fake.last_found);
5135
5136                         if ( data_fake.minlen_fixed != minlenp )
5137                         {
5138                             data->offset_fixed= data_fake.offset_fixed;
5139                             data->minlen_fixed= data_fake.minlen_fixed;
5140                             data->lookbehind_fixed+= scan->flags;
5141                         }
5142                         if ( data_fake.minlen_float != minlenp )
5143                         {
5144                             data->minlen_float= data_fake.minlen_float;
5145                             data->offset_float_min=data_fake.offset_float_min;
5146                             data->offset_float_max=data_fake.offset_float_max;
5147                             data->lookbehind_float+= scan->flags;
5148                         }
5149                     }
5150                 }
5151             }
5152 #endif
5153         }
5154         else if (OP(scan) == OPEN) {
5155             if (stopparen != (I32)ARG(scan))
5156                 pars++;
5157         }
5158         else if (OP(scan) == CLOSE) {
5159             if (stopparen == (I32)ARG(scan)) {
5160                 break;
5161             }
5162             if ((I32)ARG(scan) == is_par) {
5163                 next = regnext(scan);
5164
5165                 if ( next && (OP(next) != WHILEM) && next < last)
5166                     is_par = 0;         /* Disable optimization */
5167             }
5168             if (data)
5169                 *(data->last_closep) = ARG(scan);
5170         }
5171         else if (OP(scan) == EVAL) {
5172                 if (data)
5173                     data->flags |= SF_HAS_EVAL;
5174         }
5175         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5176             if (flags & SCF_DO_SUBSTR) {
5177                 scan_commit(pRExC_state, data, minlenp, is_inf);
5178                 flags &= ~SCF_DO_SUBSTR;
5179             }
5180             if (data && OP(scan)==ACCEPT) {
5181                 data->flags |= SCF_SEEN_ACCEPT;
5182                 if (stopmin > min)
5183                     stopmin = min;
5184             }
5185         }
5186         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5187         {
5188                 if (flags & SCF_DO_SUBSTR) {
5189                     scan_commit(pRExC_state, data, minlenp, is_inf);
5190                     data->longest = &(data->longest_float);
5191                 }
5192                 is_inf = is_inf_internal = 1;
5193                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5194                     ssc_anything(data->start_class);
5195                 flags &= ~SCF_DO_STCLASS;
5196         }
5197         else if (OP(scan) == GPOS) {
5198             if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5199                 !(delta || is_inf || (data && data->pos_delta)))
5200             {
5201                 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5202                     RExC_rx->intflags |= PREGf_ANCH_GPOS;
5203                 if (RExC_rx->gofs < (STRLEN)min)
5204                     RExC_rx->gofs = min;
5205             } else {
5206                 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5207                 RExC_rx->gofs = 0;
5208             }
5209         }
5210 #ifdef TRIE_STUDY_OPT
5211 #ifdef FULL_TRIE_STUDY
5212         else if (PL_regkind[OP(scan)] == TRIE) {
5213             /* NOTE - There is similar code to this block above for handling
5214                BRANCH nodes on the initial study.  If you change stuff here
5215                check there too. */
5216             regnode *trie_node= scan;
5217             regnode *tail= regnext(scan);
5218             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5219             SSize_t max1 = 0, min1 = SSize_t_MAX;
5220             regnode_ssc accum;
5221
5222             if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5223                 /* Cannot merge strings after this. */
5224                 scan_commit(pRExC_state, data, minlenp, is_inf);
5225             }
5226             if (flags & SCF_DO_STCLASS)
5227                 ssc_init_zero(pRExC_state, &accum);
5228
5229             if (!trie->jump) {
5230                 min1= trie->minlen;
5231                 max1= trie->maxlen;
5232             } else {
5233                 const regnode *nextbranch= NULL;
5234                 U32 word;
5235
5236                 for ( word=1 ; word <= trie->wordcount ; word++)
5237                 {
5238                     SSize_t deltanext=0, minnext=0, f = 0, fake;
5239                     regnode_ssc this_class;
5240
5241                     data_fake.flags = 0;
5242                     if (data) {
5243                         data_fake.whilem_c = data->whilem_c;
5244                         data_fake.last_closep = data->last_closep;
5245                     }
5246                     else
5247                         data_fake.last_closep = &fake;
5248                     data_fake.pos_delta = delta;
5249                     if (flags & SCF_DO_STCLASS) {
5250                         ssc_init(pRExC_state, &this_class);
5251                         data_fake.start_class = &this_class;
5252                         f = SCF_DO_STCLASS_AND;
5253                     }
5254                     if (flags & SCF_WHILEM_VISITED_POS)
5255                         f |= SCF_WHILEM_VISITED_POS;
5256
5257                     if (trie->jump[word]) {
5258                         if (!nextbranch)
5259                             nextbranch = trie_node + trie->jump[0];
5260                         scan= trie_node + trie->jump[word];
5261                         /* We go from the jump point to the branch that follows
5262                            it. Note this means we need the vestigal unused
5263                            branches even though they arent otherwise used. */
5264                         minnext = study_chunk(pRExC_state, &scan, minlenp,
5265                             &deltanext, (regnode *)nextbranch, &data_fake,
5266                             stopparen, recursed_depth, NULL, f,depth+1);
5267                     }
5268                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5269                         nextbranch= regnext((regnode*)nextbranch);
5270
5271                     if (min1 > (SSize_t)(minnext + trie->minlen))
5272                         min1 = minnext + trie->minlen;
5273                     if (deltanext == SSize_t_MAX) {
5274                         is_inf = is_inf_internal = 1;
5275                         max1 = SSize_t_MAX;
5276                     } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5277                         max1 = minnext + deltanext + trie->maxlen;
5278
5279                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5280                         pars++;
5281                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
5282                         if ( stopmin > min + min1)
5283                             stopmin = min + min1;
5284                         flags &= ~SCF_DO_SUBSTR;
5285                         if (data)
5286                             data->flags |= SCF_SEEN_ACCEPT;
5287                     }
5288                     if (data) {
5289                         if (data_fake.flags & SF_HAS_EVAL)
5290                             data->flags |= SF_HAS_EVAL;
5291                         data->whilem_c = data_fake.whilem_c;
5292                     }
5293                     if (flags & SCF_DO_STCLASS)
5294                         ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5295                 }
5296             }
5297             if (flags & SCF_DO_SUBSTR) {
5298                 data->pos_min += min1;
5299                 data->pos_delta += max1 - min1;
5300                 if (max1 != min1 || is_inf)
5301                     data->longest = &(data->longest_float);
5302             }
5303             min += min1;
5304             delta += max1 - min1;
5305             if (flags & SCF_DO_STCLASS_OR) {
5306                 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5307                 if (min1) {
5308                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5309                     flags &= ~SCF_DO_STCLASS;
5310                 }
5311             }
5312             else if (flags & SCF_DO_STCLASS_AND) {
5313                 if (min1) {
5314                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5315                     flags &= ~SCF_DO_STCLASS;
5316                 }
5317                 else {
5318                     /* Switch to OR mode: cache the old value of
5319                      * data->start_class */
5320                     INIT_AND_WITHP;
5321                     StructCopy(data->start_class, and_withp, regnode_ssc);
5322                     flags &= ~SCF_DO_STCLASS_AND;
5323                     StructCopy(&accum, data->start_class, regnode_ssc);
5324                     flags |= SCF_DO_STCLASS_OR;
5325                 }
5326             }
5327             scan= tail;
5328             continue;
5329         }
5330 #else
5331         else if (PL_regkind[OP(scan)] == TRIE) {
5332             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5333             U8*bang=NULL;
5334
5335             min += trie->minlen;
5336             delta += (trie->maxlen - trie->minlen);
5337             flags &= ~SCF_DO_STCLASS; /* xxx */
5338             if (flags & SCF_DO_SUBSTR) {
5339                 /* Cannot expect anything... */
5340                 scan_commit(pRExC_state, data, minlenp, is_inf);
5341                 data->pos_min += trie->minlen;
5342                 data->pos_delta += (trie->maxlen - trie->minlen);
5343                 if (trie->maxlen != trie->minlen)
5344                     data->longest = &(data->longest_float);
5345             }
5346             if (trie->jump) /* no more substrings -- for now /grr*/
5347                flags &= ~SCF_DO_SUBSTR;
5348         }
5349 #endif /* old or new */
5350 #endif /* TRIE_STUDY_OPT */
5351
5352         /* Else: zero-length, ignore. */
5353         scan = regnext(scan);
5354     }
5355     /* If we are exiting a recursion we can unset its recursed bit
5356      * and allow ourselves to enter it again - no danger of an
5357      * infinite loop there.
5358     if (stopparen > -1 && recursed) {
5359         DEBUG_STUDYDATA("unset:", data,depth);
5360         PAREN_UNSET( recursed, stopparen);
5361     }
5362     */
5363     if (frame) {
5364         DEBUG_STUDYDATA("frame-end:",data,depth);
5365         DEBUG_PEEP("fend", scan, depth);
5366         /* restore previous context */
5367         last = frame->last;
5368         scan = frame->next;
5369         stopparen = frame->stop;
5370         recursed_depth = frame->prev_recursed_depth;
5371         depth = depth - 1;
5372
5373         frame = frame->prev;
5374         goto fake_study_recurse;
5375     }
5376
5377   finish:
5378     assert(!frame);
5379     DEBUG_STUDYDATA("pre-fin:",data,depth);
5380
5381     *scanp = scan;
5382     *deltap = is_inf_internal ? SSize_t_MAX : delta;
5383
5384     if (flags & SCF_DO_SUBSTR && is_inf)
5385         data->pos_delta = SSize_t_MAX - data->pos_min;
5386     if (is_par > (I32)U8_MAX)
5387         is_par = 0;
5388     if (is_par && pars==1 && data) {
5389         data->flags |= SF_IN_PAR;
5390         data->flags &= ~SF_HAS_PAR;
5391     }
5392     else if (pars && data) {
5393         data->flags |= SF_HAS_PAR;
5394         data->flags &= ~SF_IN_PAR;
5395     }
5396     if (flags & SCF_DO_STCLASS_OR)
5397         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5398     if (flags & SCF_TRIE_RESTUDY)
5399         data->flags |=  SCF_TRIE_RESTUDY;
5400
5401     DEBUG_STUDYDATA("post-fin:",data,depth);
5402
5403     {
5404         SSize_t final_minlen= min < stopmin ? min : stopmin;
5405
5406         if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) {
5407             RExC_maxlen = final_minlen + delta;
5408         }
5409         return final_minlen;
5410     }
5411     /* not-reached */
5412 }
5413
5414 STATIC U32
5415 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5416 {
5417     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5418
5419     PERL_ARGS_ASSERT_ADD_DATA;
5420
5421     Renewc(RExC_rxi->data,
5422            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5423            char, struct reg_data);
5424     if(count)
5425         Renew(RExC_rxi->data->what, count + n, U8);
5426     else
5427         Newx(RExC_rxi->data->what, n, U8);
5428     RExC_rxi->data->count = count + n;
5429     Copy(s, RExC_rxi->data->what + count, n, U8);
5430     return count;
5431 }
5432
5433 /*XXX: todo make this not included in a non debugging perl */
5434 #ifndef PERL_IN_XSUB_RE
5435 void
5436 Perl_reginitcolors(pTHX)
5437 {
5438     dVAR;
5439     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5440     if (s) {
5441         char *t = savepv(s);
5442         int i = 0;
5443         PL_colors[0] = t;
5444         while (++i < 6) {
5445             t = strchr(t, '\t');
5446             if (t) {
5447                 *t = '\0';
5448                 PL_colors[i] = ++t;
5449             }
5450             else
5451                 PL_colors[i] = t = (char *)"";
5452         }
5453     } else {
5454         int i = 0;
5455         while (i < 6)
5456             PL_colors[i++] = (char *)"";
5457     }
5458     PL_colorset = 1;
5459 }
5460 #endif
5461
5462
5463 #ifdef TRIE_STUDY_OPT
5464 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
5465     STMT_START {                                            \
5466         if (                                                \
5467               (data.flags & SCF_TRIE_RESTUDY)               \
5468               && ! restudied++                              \
5469         ) {                                                 \
5470             dOsomething;                                    \
5471             goto reStudy;                                   \
5472         }                                                   \
5473     } STMT_END
5474 #else
5475 #define CHECK_RESTUDY_GOTO_butfirst
5476 #endif
5477
5478 /*
5479  * pregcomp - compile a regular expression into internal code
5480  *
5481  * Decides which engine's compiler to call based on the hint currently in
5482  * scope
5483  */
5484
5485 #ifndef PERL_IN_XSUB_RE
5486
5487 /* return the currently in-scope regex engine (or the default if none)  */
5488
5489 regexp_engine const *
5490 Perl_current_re_engine(pTHX)
5491 {
5492     dVAR;
5493
5494     if (IN_PERL_COMPILETIME) {
5495         HV * const table = GvHV(PL_hintgv);
5496         SV **ptr;
5497
5498         if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5499             return &PL_core_reg_engine;
5500         ptr = hv_fetchs(table, "regcomp", FALSE);
5501         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5502             return &PL_core_reg_engine;
5503         return INT2PTR(regexp_engine*,SvIV(*ptr));
5504     }
5505     else {
5506         SV *ptr;
5507         if (!PL_curcop->cop_hints_hash)
5508             return &PL_core_reg_engine;
5509         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5510         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5511             return &PL_core_reg_engine;
5512         return INT2PTR(regexp_engine*,SvIV(ptr));
5513     }
5514 }
5515
5516
5517 REGEXP *
5518 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5519 {
5520     dVAR;
5521     regexp_engine const *eng = current_re_engine();
5522     GET_RE_DEBUG_FLAGS_DECL;
5523
5524     PERL_ARGS_ASSERT_PREGCOMP;
5525
5526     /* Dispatch a request to compile a regexp to correct regexp engine. */
5527     DEBUG_COMPILE_r({
5528         PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5529                         PTR2UV(eng));
5530     });
5531     return CALLREGCOMP_ENG(eng, pattern, flags);
5532 }
5533 #endif
5534
5535 /* public(ish) entry point for the perl core's own regex compiling code.
5536  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5537  * pattern rather than a list of OPs, and uses the internal engine rather
5538  * than the current one */
5539
5540 REGEXP *
5541 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5542 {
5543     SV *pat = pattern; /* defeat constness! */
5544     PERL_ARGS_ASSERT_RE_COMPILE;
5545     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5546 #ifdef PERL_IN_XSUB_RE
5547                                 &my_reg_engine,
5548 #else
5549                                 &PL_core_reg_engine,
5550 #endif
5551                                 NULL, NULL, rx_flags, 0);
5552 }
5553
5554
5555 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5556  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5557  * point to the realloced string and length.
5558  *
5559  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5560  * stuff added */
5561
5562 static void
5563 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5564                     char **pat_p, STRLEN *plen_p, int num_code_blocks)
5565 {
5566     U8 *const src = (U8*)*pat_p;
5567     U8 *dst;
5568     int n=0;
5569     STRLEN s = 0, d = 0;
5570     bool do_end = 0;
5571     GET_RE_DEBUG_FLAGS_DECL;
5572
5573     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5574         "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5575
5576     Newx(dst, *plen_p * 2 + 1, U8);
5577
5578     while (s < *plen_p) {
5579         if (NATIVE_BYTE_IS_INVARIANT(src[s]))
5580             dst[d]   = src[s];
5581         else {
5582             dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
5583             dst[d]   = UTF8_EIGHT_BIT_LO(src[s]);
5584         }
5585         if (n < num_code_blocks) {
5586             if (!do_end && pRExC_state->code_blocks[n].start == s) {
5587                 pRExC_state->code_blocks[n].start = d;
5588                 assert(dst[d] == '(');
5589                 do_end = 1;
5590             }
5591             else if (do_end && pRExC_state->code_blocks[n].end == s) {
5592                 pRExC_state->code_blocks[n].end = d;
5593                 assert(dst[d] == ')');
5594                 do_end = 0;
5595                 n++;
5596             }
5597         }
5598         s++;
5599         d++;
5600     }
5601     dst[d] = '\0';
5602     *plen_p = d;
5603     *pat_p = (char*) dst;
5604     SAVEFREEPV(*pat_p);
5605     RExC_orig_utf8 = RExC_utf8 = 1;
5606 }
5607
5608
5609
5610 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5611  * while recording any code block indices, and handling overloading,
5612  * nested qr// objects etc.  If pat is null, it will allocate a new
5613  * string, or just return the first arg, if there's only one.
5614  *
5615  * Returns the malloced/updated pat.
5616  * patternp and pat_count is the array of SVs to be concatted;
5617  * oplist is the optional list of ops that generated the SVs;
5618  * recompile_p is a pointer to a boolean that will be set if
5619  *   the regex will need to be recompiled.
5620  * delim, if non-null is an SV that will be inserted between each element
5621  */
5622
5623 static SV*
5624 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5625                 SV *pat, SV ** const patternp, int pat_count,
5626                 OP *oplist, bool *recompile_p, SV *delim)
5627 {
5628     SV **svp;
5629     int n = 0;
5630     bool use_delim = FALSE;
5631     bool alloced = FALSE;
5632
5633     /* if we know we have at least two args, create an empty string,
5634      * then concatenate args to that. For no args, return an empty string */
5635     if (!pat && pat_count != 1) {
5636         pat = newSVpvn("", 0);
5637         SAVEFREESV(pat);
5638         alloced = TRUE;
5639     }
5640
5641     for (svp = patternp; svp < patternp + pat_count; svp++) {
5642         SV *sv;
5643         SV *rx  = NULL;
5644         STRLEN orig_patlen = 0;
5645         bool code = 0;
5646         SV *msv = use_delim ? delim : *svp;
5647         if (!msv) msv = &PL_sv_undef;
5648
5649         /* if we've got a delimiter, we go round the loop twice for each
5650          * svp slot (except the last), using the delimiter the second
5651          * time round */
5652         if (use_delim) {
5653             svp--;
5654             use_delim = FALSE;
5655         }
5656         else if (delim)
5657             use_delim = TRUE;
5658
5659         if (SvTYPE(msv) == SVt_PVAV) {
5660             /* we've encountered an interpolated array within
5661              * the pattern, e.g. /...@a..../. Expand the list of elements,
5662              * then recursively append elements.
5663              * The code in this block is based on S_pushav() */
5664
5665             AV *const av = (AV*)msv;
5666             const SSize_t maxarg = AvFILL(av) + 1;
5667             SV **array;
5668
5669             if (oplist) {
5670                 assert(oplist->op_type == OP_PADAV
5671                     || oplist->op_type == OP_RV2AV);
5672                 oplist = oplist->op_sibling;;
5673             }
5674
5675             if (SvRMAGICAL(av)) {
5676                 SSize_t i;
5677
5678                 Newx(array, maxarg, SV*);
5679                 SAVEFREEPV(array);
5680                 for (i=0; i < maxarg; i++) {
5681                     SV ** const svp = av_fetch(av, i, FALSE);
5682                     array[i] = svp ? *svp : &PL_sv_undef;
5683                 }
5684             }
5685             else
5686                 array = AvARRAY(av);
5687
5688             pat = S_concat_pat(aTHX_ pRExC_state, pat,
5689                                 array, maxarg, NULL, recompile_p,
5690                                 /* $" */
5691                                 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5692
5693             continue;
5694         }
5695
5696
5697         /* we make the assumption here that each op in the list of
5698          * op_siblings maps to one SV pushed onto the stack,
5699          * except for code blocks, with have both an OP_NULL and
5700          * and OP_CONST.
5701          * This allows us to match up the list of SVs against the
5702          * list of OPs to find the next code block.
5703          *
5704          * Note that       PUSHMARK PADSV PADSV ..
5705          * is optimised to
5706          *                 PADRANGE PADSV  PADSV  ..
5707          * so the alignment still works. */
5708
5709         if (oplist) {
5710             if (oplist->op_type == OP_NULL
5711                 && (oplist->op_flags & OPf_SPECIAL))
5712             {
5713                 assert(n < pRExC_state->num_code_blocks);
5714                 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5715                 pRExC_state->code_blocks[n].block = oplist;
5716                 pRExC_state->code_blocks[n].src_regex = NULL;
5717                 n++;
5718                 code = 1;
5719                 oplist = oplist->op_sibling; /* skip CONST */
5720                 assert(oplist);
5721             }
5722             oplist = oplist->op_sibling;;
5723         }
5724
5725         /* apply magic and QR overloading to arg */
5726
5727         SvGETMAGIC(msv);
5728         if (SvROK(msv) && SvAMAGIC(msv)) {
5729             SV *sv = AMG_CALLunary(msv, regexp_amg);
5730             if (sv) {
5731                 if (SvROK(sv))
5732                     sv = SvRV(sv);
5733                 if (SvTYPE(sv) != SVt_REGEXP)
5734                     Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5735                 msv = sv;
5736             }
5737         }
5738
5739         /* try concatenation overload ... */
5740         if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5741                 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5742         {
5743             sv_setsv(pat, sv);
5744             /* overloading involved: all bets are off over literal
5745              * code. Pretend we haven't seen it */
5746             pRExC_state->num_code_blocks -= n;
5747             n = 0;
5748         }
5749         else  {
5750             /* ... or failing that, try "" overload */
5751             while (SvAMAGIC(msv)
5752                     && (sv = AMG_CALLunary(msv, string_amg))
5753                     && sv != msv
5754                     &&  !(   SvROK(msv)
5755                           && SvROK(sv)
5756                           && SvRV(msv) == SvRV(sv))
5757             ) {
5758                 msv = sv;
5759                 SvGETMAGIC(msv);
5760             }
5761             if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5762                 msv = SvRV(msv);
5763
5764             if (pat) {
5765                 /* this is a partially unrolled
5766                  *     sv_catsv_nomg(pat, msv);
5767                  * that allows us to adjust code block indices if
5768                  * needed */
5769                 STRLEN dlen;
5770                 char *dst = SvPV_force_nomg(pat, dlen);
5771                 orig_patlen = dlen;
5772                 if (SvUTF8(msv) && !SvUTF8(pat)) {
5773                     S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5774                     sv_setpvn(pat, dst, dlen);
5775                     SvUTF8_on(pat);
5776                 }
5777                 sv_catsv_nomg(pat, msv);
5778                 rx = msv;
5779             }
5780             else
5781                 pat = msv;
5782
5783             if (code)
5784                 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5785         }
5786
5787         /* extract any code blocks within any embedded qr//'s */
5788         if (rx && SvTYPE(rx) == SVt_REGEXP
5789             && RX_ENGINE((REGEXP*)rx)->op_comp)
5790         {
5791
5792             RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5793             if (ri->num_code_blocks) {
5794                 int i;
5795                 /* the presence of an embedded qr// with code means
5796                  * we should always recompile: the text of the
5797                  * qr// may not have changed, but it may be a
5798                  * different closure than last time */
5799                 *recompile_p = 1;
5800                 Renew(pRExC_state->code_blocks,
5801                     pRExC_state->num_code_blocks + ri->num_code_blocks,
5802                     struct reg_code_block);
5803                 pRExC_state->num_code_blocks += ri->num_code_blocks;
5804
5805                 for (i=0; i < ri->num_code_blocks; i++) {
5806                     struct reg_code_block *src, *dst;
5807                     STRLEN offset =  orig_patlen
5808                         + ReANY((REGEXP *)rx)->pre_prefix;
5809                     assert(n < pRExC_state->num_code_blocks);
5810                     src = &ri->code_blocks[i];
5811                     dst = &pRExC_state->code_blocks[n];
5812                     dst->start      = src->start + offset;
5813                     dst->end        = src->end   + offset;
5814                     dst->block      = src->block;
5815                     dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
5816                                             src->src_regex
5817                                                 ? src->src_regex
5818                                                 : (REGEXP*)rx);
5819                     n++;
5820                 }
5821             }
5822         }
5823     }
5824     /* avoid calling magic multiple times on a single element e.g. =~ $qr */
5825     if (alloced)
5826         SvSETMAGIC(pat);
5827
5828     return pat;
5829 }
5830
5831
5832
5833 /* see if there are any run-time code blocks in the pattern.
5834  * False positives are allowed */
5835
5836 static bool
5837 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5838                     char *pat, STRLEN plen)
5839 {
5840     int n = 0;
5841     STRLEN s;
5842
5843     for (s = 0; s < plen; s++) {
5844         if (n < pRExC_state->num_code_blocks
5845             && s == pRExC_state->code_blocks[n].start)
5846         {
5847             s = pRExC_state->code_blocks[n].end;
5848             n++;
5849             continue;
5850         }
5851         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5852          * positives here */
5853         if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
5854             (pat[s+2] == '{'
5855                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
5856         )
5857             return 1;
5858     }
5859     return 0;
5860 }
5861
5862 /* Handle run-time code blocks. We will already have compiled any direct
5863  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5864  * copy of it, but with any literal code blocks blanked out and
5865  * appropriate chars escaped; then feed it into
5866  *
5867  *    eval "qr'modified_pattern'"
5868  *
5869  * For example,
5870  *
5871  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5872  *
5873  * becomes
5874  *
5875  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5876  *
5877  * After eval_sv()-ing that, grab any new code blocks from the returned qr
5878  * and merge them with any code blocks of the original regexp.
5879  *
5880  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5881  * instead, just save the qr and return FALSE; this tells our caller that
5882  * the original pattern needs upgrading to utf8.
5883  */
5884
5885 static bool
5886 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5887     char *pat, STRLEN plen)
5888 {
5889     SV *qr;
5890
5891     GET_RE_DEBUG_FLAGS_DECL;
5892
5893     if (pRExC_state->runtime_code_qr) {
5894         /* this is the second time we've been called; this should
5895          * only happen if the main pattern got upgraded to utf8
5896          * during compilation; re-use the qr we compiled first time
5897          * round (which should be utf8 too)
5898          */
5899         qr = pRExC_state->runtime_code_qr;
5900         pRExC_state->runtime_code_qr = NULL;
5901         assert(RExC_utf8 && SvUTF8(qr));
5902     }
5903     else {
5904         int n = 0;
5905         STRLEN s;
5906         char *p, *newpat;
5907         int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5908         SV *sv, *qr_ref;
5909         dSP;
5910
5911         /* determine how many extra chars we need for ' and \ escaping */
5912         for (s = 0; s < plen; s++) {
5913             if (pat[s] == '\'' || pat[s] == '\\')
5914                 newlen++;
5915         }
5916
5917         Newx(newpat, newlen, char);
5918         p = newpat;
5919         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5920
5921         for (s = 0; s < plen; s++) {
5922             if (n < pRExC_state->num_code_blocks
5923                 && s == pRExC_state->code_blocks[n].start)
5924             {
5925                 /* blank out literal code block */
5926                 assert(pat[s] == '(');
5927                 while (s <= pRExC_state->code_blocks[n].end) {
5928                     *p++ = '_';
5929                     s++;
5930                 }
5931                 s--;
5932                 n++;
5933                 continue;
5934             }
5935             if (pat[s] == '\'' || pat[s] == '\\')
5936                 *p++ = '\\';
5937             *p++ = pat[s];
5938         }
5939         *p++ = '\'';
5940         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5941             *p++ = 'x';
5942         *p++ = '\0';
5943         DEBUG_COMPILE_r({
5944             PerlIO_printf(Perl_debug_log,
5945                 "%sre-parsing pattern for runtime code:%s %s\n",
5946                 PL_colors[4],PL_colors[5],newpat);
5947         });
5948
5949         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5950         Safefree(newpat);
5951
5952         ENTER;
5953         SAVETMPS;
5954         save_re_context();
5955         PUSHSTACKi(PERLSI_REQUIRE);
5956         /* G_RE_REPARSING causes the toker to collapse \\ into \ when
5957          * parsing qr''; normally only q'' does this. It also alters
5958          * hints handling */
5959         eval_sv(sv, G_SCALAR|G_RE_REPARSING);
5960         SvREFCNT_dec_NN(sv);
5961         SPAGAIN;
5962         qr_ref = POPs;
5963         PUTBACK;
5964         {
5965             SV * const errsv = ERRSV;
5966             if (SvTRUE_NN(errsv))
5967             {
5968                 Safefree(pRExC_state->code_blocks);
5969                 /* use croak_sv ? */
5970                 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
5971             }
5972         }
5973         assert(SvROK(qr_ref));
5974         qr = SvRV(qr_ref);
5975         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5976         /* the leaving below frees the tmp qr_ref.
5977          * Give qr a life of its own */
5978         SvREFCNT_inc(qr);
5979         POPSTACK;
5980         FREETMPS;
5981         LEAVE;
5982
5983     }
5984
5985     if (!RExC_utf8 && SvUTF8(qr)) {
5986         /* first time through; the pattern got upgraded; save the
5987          * qr for the next time through */
5988         assert(!pRExC_state->runtime_code_qr);
5989         pRExC_state->runtime_code_qr = qr;
5990         return 0;
5991     }
5992
5993
5994     /* extract any code blocks within the returned qr//  */
5995
5996
5997     /* merge the main (r1) and run-time (r2) code blocks into one */
5998     {
5999         RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6000         struct reg_code_block *new_block, *dst;
6001         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6002         int i1 = 0, i2 = 0;
6003
6004         if (!r2->num_code_blocks) /* we guessed wrong */
6005         {
6006             SvREFCNT_dec_NN(qr);
6007             return 1;
6008         }
6009
6010         Newx(new_block,
6011             r1->num_code_blocks + r2->num_code_blocks,
6012             struct reg_code_block);
6013         dst = new_block;
6014
6015         while (    i1 < r1->num_code_blocks
6016                 || i2 < r2->num_code_blocks)
6017         {
6018             struct reg_code_block *src;
6019             bool is_qr = 0;
6020
6021             if (i1 == r1->num_code_blocks) {
6022                 src = &r2->code_blocks[i2++];
6023                 is_qr = 1;
6024             }
6025             else if (i2 == r2->num_code_blocks)
6026                 src = &r1->code_blocks[i1++];
6027             else if (  r1->code_blocks[i1].start
6028                      < r2->code_blocks[i2].start)
6029             {
6030                 src = &r1->code_blocks[i1++];
6031                 assert(src->end < r2->code_blocks[i2].start);
6032             }
6033             else {
6034                 assert(  r1->code_blocks[i1].start
6035                        > r2->code_blocks[i2].start);
6036                 src = &r2->code_blocks[i2++];
6037                 is_qr = 1;
6038                 assert(src->end < r1->code_blocks[i1].start);
6039             }
6040
6041             assert(pat[src->start] == '(');
6042             assert(pat[src->end]   == ')');
6043             dst->start      = src->start;
6044             dst->end        = src->end;
6045             dst->block      = src->block;
6046             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6047                                     : src->src_regex;
6048             dst++;
6049         }
6050         r1->num_code_blocks += r2->num_code_blocks;
6051         Safefree(r1->code_blocks);
6052         r1->code_blocks = new_block;
6053     }
6054
6055     SvREFCNT_dec_NN(qr);
6056     return 1;
6057 }
6058
6059
6060 STATIC bool
6061 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6062                       SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6063                       SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6064                       STRLEN longest_length, bool eol, bool meol)
6065 {
6066     /* This is the common code for setting up the floating and fixed length
6067      * string data extracted from Perl_re_op_compile() below.  Returns a boolean
6068      * as to whether succeeded or not */
6069
6070     I32 t;
6071     SSize_t ml;
6072
6073     if (! (longest_length
6074            || (eol /* Can't have SEOL and MULTI */
6075                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6076           )
6077             /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6078         || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6079     {
6080         return FALSE;
6081     }
6082
6083     /* copy the information about the longest from the reg_scan_data
6084         over to the program. */
6085     if (SvUTF8(sv_longest)) {
6086         *rx_utf8 = sv_longest;
6087         *rx_substr = NULL;
6088     } else {
6089         *rx_substr = sv_longest;
6090         *rx_utf8 = NULL;
6091     }
6092     /* end_shift is how many chars that must be matched that
6093         follow this item. We calculate it ahead of time as once the
6094         lookbehind offset is added in we lose the ability to correctly
6095         calculate it.*/
6096     ml = minlen ? *(minlen) : (SSize_t)longest_length;
6097     *rx_end_shift = ml - offset
6098         - longest_length + (SvTAIL(sv_longest) != 0)
6099         + lookbehind;
6100
6101     t = (eol/* Can't have SEOL and MULTI */
6102          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6103     fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6104
6105     return TRUE;
6106 }
6107
6108 /*
6109  * Perl_re_op_compile - the perl internal RE engine's function to compile a
6110  * regular expression into internal code.
6111  * The pattern may be passed either as:
6112  *    a list of SVs (patternp plus pat_count)
6113  *    a list of OPs (expr)
6114  * If both are passed, the SV list is used, but the OP list indicates
6115  * which SVs are actually pre-compiled code blocks
6116  *
6117  * The SVs in the list have magic and qr overloading applied to them (and
6118  * the list may be modified in-place with replacement SVs in the latter
6119  * case).
6120  *
6121  * If the pattern hasn't changed from old_re, then old_re will be
6122  * returned.
6123  *
6124  * eng is the current engine. If that engine has an op_comp method, then
6125  * handle directly (i.e. we assume that op_comp was us); otherwise, just
6126  * do the initial concatenation of arguments and pass on to the external
6127  * engine.
6128  *
6129  * If is_bare_re is not null, set it to a boolean indicating whether the
6130  * arg list reduced (after overloading) to a single bare regex which has
6131  * been returned (i.e. /$qr/).
6132  *
6133  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6134  *
6135  * pm_flags contains the PMf_* flags, typically based on those from the
6136  * pm_flags field of the related PMOP. Currently we're only interested in
6137  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6138  *
6139  * We can't allocate space until we know how big the compiled form will be,
6140  * but we can't compile it (and thus know how big it is) until we've got a
6141  * place to put the code.  So we cheat:  we compile it twice, once with code
6142  * generation turned off and size counting turned on, and once "for real".
6143  * This also means that we don't allocate space until we are sure that the
6144  * thing really will compile successfully, and we never have to move the
6145  * code and thus invalidate pointers into it.  (Note that it has to be in
6146  * one piece because free() must be able to free it all.) [NB: not true in perl]
6147  *
6148  * Beware that the optimization-preparation code in here knows about some
6149  * of the structure of the compiled regexp.  [I'll say.]
6150  */
6151
6152 REGEXP *
6153 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6154                     OP *expr, const regexp_engine* eng, REGEXP *old_re,
6155                      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6156 {
6157     dVAR;
6158     REGEXP *rx;
6159     struct regexp *r;
6160     regexp_internal *ri;
6161     STRLEN plen;
6162     char *exp;
6163     regnode *scan;
6164     I32 flags;
6165     SSize_t minlen = 0;
6166     U32 rx_flags;
6167     SV *pat;
6168     SV *code_blocksv = NULL;
6169     SV** new_patternp = patternp;
6170
6171     /* these are all flags - maybe they should be turned
6172      * into a single int with different bit masks */
6173     I32 sawlookahead = 0;
6174     I32 sawplus = 0;
6175     I32 sawopen = 0;
6176     I32 sawminmod = 0;
6177
6178     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6179     bool recompile = 0;
6180     bool runtime_code = 0;
6181     scan_data_t data;
6182     RExC_state_t RExC_state;
6183     RExC_state_t * const pRExC_state = &RExC_state;
6184 #ifdef TRIE_STUDY_OPT
6185     int restudied = 0;
6186     RExC_state_t copyRExC_state;
6187 #endif
6188     GET_RE_DEBUG_FLAGS_DECL;
6189
6190     PERL_ARGS_ASSERT_RE_OP_COMPILE;
6191
6192     DEBUG_r(if (!PL_colorset) reginitcolors());
6193
6194 #ifndef PERL_IN_XSUB_RE
6195     /* Initialize these here instead of as-needed, as is quick and avoids
6196      * having to test them each time otherwise */
6197     if (! PL_AboveLatin1) {
6198         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6199         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6200         PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6201         PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6202         PL_HasMultiCharFold =
6203                        _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6204     }
6205 #endif
6206
6207     pRExC_state->code_blocks = NULL;
6208     pRExC_state->num_code_blocks = 0;
6209
6210     if (is_bare_re)
6211         *is_bare_re = FALSE;
6212
6213     if (expr && (expr->op_type == OP_LIST ||
6214                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6215         /* allocate code_blocks if needed */
6216         OP *o;
6217         int ncode = 0;
6218
6219         for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
6220             if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6221                 ncode++; /* count of DO blocks */
6222         if (ncode) {
6223             pRExC_state->num_code_blocks = ncode;
6224             Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6225         }
6226     }
6227
6228     if (!pat_count) {
6229         /* compile-time pattern with just OP_CONSTs and DO blocks */
6230
6231         int n;
6232         OP *o;
6233
6234         /* find how many CONSTs there are */
6235         assert(expr);
6236         n = 0;
6237         if (expr->op_type == OP_CONST)
6238             n = 1;
6239         else
6240             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6241                 if (o->op_type == OP_CONST)
6242                     n++;
6243             }
6244
6245         /* fake up an SV array */
6246
6247         assert(!new_patternp);
6248         Newx(new_patternp, n, SV*);
6249         SAVEFREEPV(new_patternp);
6250         pat_count = n;
6251
6252         n = 0;
6253         if (expr->op_type == OP_CONST)
6254             new_patternp[n] = cSVOPx_sv(expr);
6255         else
6256             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6257                 if (o->op_type == OP_CONST)
6258                     new_patternp[n++] = cSVOPo_sv;
6259             }
6260
6261     }
6262
6263     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6264         "Assembling pattern from %d elements%s\n", pat_count,
6265             orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6266
6267     /* set expr to the first arg op */
6268
6269     if (pRExC_state->num_code_blocks
6270          && expr->op_type != OP_CONST)
6271     {
6272             expr = cLISTOPx(expr)->op_first;
6273             assert(   expr->op_type == OP_PUSHMARK
6274                    || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6275                    || expr->op_type == OP_PADRANGE);
6276             expr = expr->op_sibling;
6277     }
6278
6279     pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6280                         expr, &recompile, NULL);
6281
6282     /* handle bare (possibly after overloading) regex: foo =~ $re */
6283     {
6284         SV *re = pat;
6285         if (SvROK(re))
6286             re = SvRV(re);
6287         if (SvTYPE(re) == SVt_REGEXP) {
6288             if (is_bare_re)
6289                 *is_bare_re = TRUE;
6290             SvREFCNT_inc(re);
6291             Safefree(pRExC_state->code_blocks);
6292             DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6293                 "Precompiled pattern%s\n",
6294                     orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6295
6296             return (REGEXP*)re;
6297         }
6298     }
6299
6300     exp = SvPV_nomg(pat, plen);
6301
6302     if (!eng->op_comp) {
6303         if ((SvUTF8(pat) && IN_BYTES)
6304                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6305         {
6306             /* make a temporary copy; either to convert to bytes,
6307              * or to avoid repeating get-magic / overloaded stringify */
6308             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6309                                         (IN_BYTES ? 0 : SvUTF8(pat)));
6310         }
6311         Safefree(pRExC_state->code_blocks);
6312         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6313     }
6314
6315     /* ignore the utf8ness if the pattern is 0 length */
6316     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6317     RExC_uni_semantics = 0;
6318     RExC_contains_locale = 0;
6319     RExC_contains_i = 0;
6320     pRExC_state->runtime_code_qr = NULL;
6321
6322     DEBUG_COMPILE_r({
6323             SV *dsv= sv_newmortal();
6324             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6325             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6326                           PL_colors[4],PL_colors[5],s);
6327         });
6328
6329   redo_first_pass:
6330     /* we jump here if we upgrade the pattern to utf8 and have to
6331      * recompile */
6332
6333     if ((pm_flags & PMf_USE_RE_EVAL)
6334                 /* this second condition covers the non-regex literal case,
6335                  * i.e.  $foo =~ '(?{})'. */
6336                 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6337     )
6338         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6339
6340     /* return old regex if pattern hasn't changed */
6341     /* XXX: note in the below we have to check the flags as well as the
6342      * pattern.
6343      *
6344      * Things get a touch tricky as we have to compare the utf8 flag
6345      * independently from the compile flags.  */
6346
6347     if (   old_re
6348         && !recompile
6349         && !!RX_UTF8(old_re) == !!RExC_utf8
6350         && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6351         && RX_PRECOMP(old_re)
6352         && RX_PRELEN(old_re) == plen
6353         && memEQ(RX_PRECOMP(old_re), exp, plen)
6354         && !runtime_code /* with runtime code, always recompile */ )
6355     {
6356         Safefree(pRExC_state->code_blocks);
6357         return old_re;
6358     }
6359
6360     rx_flags = orig_rx_flags;
6361
6362     if (rx_flags & PMf_FOLD) {
6363         RExC_contains_i = 1;
6364     }
6365     if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6366
6367         /* Set to use unicode semantics if the pattern is in utf8 and has the
6368          * 'depends' charset specified, as it means unicode when utf8  */
6369         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6370     }
6371
6372     RExC_precomp = exp;
6373     RExC_flags = rx_flags;
6374     RExC_pm_flags = pm_flags;
6375
6376     if (runtime_code) {
6377         if (TAINTING_get && TAINT_get)
6378             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6379
6380         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6381             /* whoops, we have a non-utf8 pattern, whilst run-time code
6382              * got compiled as utf8. Try again with a utf8 pattern */
6383             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6384                                     pRExC_state->num_code_blocks);
6385             goto redo_first_pass;
6386         }
6387     }
6388     assert(!pRExC_state->runtime_code_qr);
6389
6390     RExC_sawback = 0;
6391
6392     RExC_seen = 0;
6393     RExC_maxlen = 0;
6394     RExC_in_lookbehind = 0;
6395     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6396     RExC_extralen = 0;
6397     RExC_override_recoding = 0;
6398     RExC_in_multi_char_class = 0;
6399
6400     /* First pass: determine size, legality. */
6401     RExC_parse = exp;
6402     RExC_start = exp;
6403     RExC_end = exp + plen;
6404     RExC_naughty = 0;
6405     RExC_npar = 1;
6406     RExC_nestroot = 0;
6407     RExC_size = 0L;
6408     RExC_emit = (regnode *) &RExC_emit_dummy;
6409     RExC_whilem_seen = 0;
6410     RExC_open_parens = NULL;
6411     RExC_close_parens = NULL;
6412     RExC_opend = NULL;
6413     RExC_paren_names = NULL;
6414 #ifdef DEBUGGING
6415     RExC_paren_name_list = NULL;
6416 #endif
6417     RExC_recurse = NULL;
6418     RExC_study_chunk_recursed = NULL;
6419     RExC_study_chunk_recursed_bytes= 0;
6420     RExC_recurse_count = 0;
6421     pRExC_state->code_index = 0;
6422
6423 #if 0 /* REGC() is (currently) a NOP at the first pass.
6424        * Clever compilers notice this and complain. --jhi */
6425     REGC((U8)REG_MAGIC, (char*)RExC_emit);
6426 #endif
6427     DEBUG_PARSE_r(
6428         PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6429         RExC_lastnum=0;
6430         RExC_lastparse=NULL;
6431     );
6432     /* reg may croak on us, not giving us a chance to free
6433        pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
6434        need it to survive as long as the regexp (qr/(?{})/).
6435        We must check that code_blocksv is not already set, because we may
6436        have jumped back to restart the sizing pass. */
6437     if (pRExC_state->code_blocks && !code_blocksv) {
6438         code_blocksv = newSV_type(SVt_PV);
6439         SAVEFREESV(code_blocksv);
6440         SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6441         SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6442     }
6443     if (reg(pRExC_state, 0, &flags,1) == NULL) {
6444         /* It's possible to write a regexp in ascii that represents Unicode
6445         codepoints outside of the byte range, such as via \x{100}. If we
6446         detect such a sequence we have to convert the entire pattern to utf8
6447         and then recompile, as our sizing calculation will have been based
6448         on 1 byte == 1 character, but we will need to use utf8 to encode
6449         at least some part of the pattern, and therefore must convert the whole
6450         thing.
6451         -- dmq */
6452         if (flags & RESTART_UTF8) {
6453             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6454                                     pRExC_state->num_code_blocks);
6455             goto redo_first_pass;
6456         }
6457         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6458     }
6459     if (code_blocksv)
6460         SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6461
6462     DEBUG_PARSE_r({
6463         PerlIO_printf(Perl_debug_log,
6464             "Required size %"IVdf" nodes\n"
6465             "Starting second pass (creation)\n",
6466             (IV)RExC_size);
6467         RExC_lastnum=0;
6468         RExC_lastparse=NULL;
6469     });
6470
6471     /* The first pass could have found things that force Unicode semantics */
6472     if ((RExC_utf8 || RExC_uni_semantics)
6473          && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6474     {
6475         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6476     }
6477
6478     /* Small enough for pointer-storage convention?
6479        If extralen==0, this means that we will not need long jumps. */
6480     if (RExC_size >= 0x10000L && RExC_extralen)
6481         RExC_size += RExC_extralen;
6482     else
6483         RExC_extralen = 0;
6484     if (RExC_whilem_seen > 15)
6485         RExC_whilem_seen = 15;
6486
6487     /* Allocate space and zero-initialize. Note, the two step process
6488        of zeroing when in debug mode, thus anything assigned has to
6489        happen after that */
6490     rx = (REGEXP*) newSV_type(SVt_REGEXP);
6491     r = ReANY(rx);
6492     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6493          char, regexp_internal);
6494     if ( r == NULL || ri == NULL )
6495         FAIL("Regexp out of space");
6496 #ifdef DEBUGGING
6497     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6498     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6499          char);
6500 #else
6501     /* bulk initialize base fields with 0. */
6502     Zero(ri, sizeof(regexp_internal), char);
6503 #endif
6504
6505     /* non-zero initialization begins here */
6506     RXi_SET( r, ri );
6507     r->engine= eng;
6508     r->extflags = rx_flags;
6509     RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6510
6511     if (pm_flags & PMf_IS_QR) {
6512         ri->code_blocks = pRExC_state->code_blocks;
6513         ri->num_code_blocks = pRExC_state->num_code_blocks;
6514     }
6515     else
6516     {
6517         int n;
6518         for (n = 0; n < pRExC_state->num_code_blocks; n++)
6519             if (pRExC_state->code_blocks[n].src_regex)
6520                 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6521         SAVEFREEPV(pRExC_state->code_blocks);
6522     }
6523
6524     {
6525         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6526         bool has_charset = (get_regex_charset(r->extflags)
6527                                                     != REGEX_DEPENDS_CHARSET);
6528
6529         /* The caret is output if there are any defaults: if not all the STD
6530          * flags are set, or if no character set specifier is needed */
6531         bool has_default =
6532                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6533                     || ! has_charset);
6534         bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
6535                                                    == REG_RUN_ON_COMMENT_SEEN);
6536         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6537                             >> RXf_PMf_STD_PMMOD_SHIFT);
6538         const char *fptr = STD_PAT_MODS;        /*"msix"*/
6539         char *p;
6540         /* Allocate for the worst case, which is all the std flags are turned
6541          * on.  If more precision is desired, we could do a population count of
6542          * the flags set.  This could be done with a small lookup table, or by
6543          * shifting, masking and adding, or even, when available, assembly
6544          * language for a machine-language population count.
6545          * We never output a minus, as all those are defaults, so are
6546          * covered by the caret */
6547         const STRLEN wraplen = plen + has_p + has_runon
6548             + has_default       /* If needs a caret */
6549
6550                 /* If needs a character set specifier */
6551             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6552             + (sizeof(STD_PAT_MODS) - 1)
6553             + (sizeof("(?:)") - 1);
6554
6555         Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6556         r->xpv_len_u.xpvlenu_pv = p;
6557         if (RExC_utf8)
6558             SvFLAGS(rx) |= SVf_UTF8;
6559         *p++='('; *p++='?';
6560
6561         /* If a default, cover it using the caret */
6562         if (has_default) {
6563             *p++= DEFAULT_PAT_MOD;
6564         }
6565         if (has_charset) {
6566             STRLEN len;
6567             const char* const name = get_regex_charset_name(r->extflags, &len);
6568             Copy(name, p, len, char);
6569             p += len;
6570         }
6571         if (has_p)
6572             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6573         {
6574             char ch;
6575             while((ch = *fptr++)) {
6576                 if(reganch & 1)
6577                     *p++ = ch;
6578                 reganch >>= 1;
6579             }
6580         }
6581
6582         *p++ = ':';
6583         Copy(RExC_precomp, p, plen, char);
6584         assert ((RX_WRAPPED(rx) - p) < 16);
6585         r->pre_prefix = p - RX_WRAPPED(rx);
6586         p += plen;
6587         if (has_runon)
6588             *p++ = '\n';
6589         *p++ = ')';
6590         *p = 0;
6591         SvCUR_set(rx, p - RX_WRAPPED(rx));
6592     }
6593
6594     r->intflags = 0;
6595     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6596
6597     /* setup various meta data about recursion, this all requires
6598      * RExC_npar to be correctly set, and a bit later on we clear it */
6599     if (RExC_seen & REG_RECURSE_SEEN) {
6600         Newxz(RExC_open_parens, RExC_npar,regnode *);
6601         SAVEFREEPV(RExC_open_parens);
6602         Newxz(RExC_close_parens,RExC_npar,regnode *);
6603         SAVEFREEPV(RExC_close_parens);
6604     }
6605     if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) {
6606         /* Note, RExC_npar is 1 + the number of parens in a pattern.
6607          * So its 1 if there are no parens. */
6608         RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6609                                          ((RExC_npar & 0x07) != 0);
6610         Newx(RExC_study_chunk_recursed,
6611              RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6612         SAVEFREEPV(RExC_study_chunk_recursed);
6613     }
6614
6615     /* Useful during FAIL. */
6616 #ifdef RE_TRACK_PATTERN_OFFSETS
6617     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6618     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6619                           "%s %"UVuf" bytes for offset annotations.\n",
6620                           ri->u.offsets ? "Got" : "Couldn't get",
6621                           (UV)((2*RExC_size+1) * sizeof(U32))));
6622 #endif
6623     SetProgLen(ri,RExC_size);
6624     RExC_rx_sv = rx;
6625     RExC_rx = r;
6626     RExC_rxi = ri;
6627
6628     /* Second pass: emit code. */
6629     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
6630     RExC_pm_flags = pm_flags;
6631     RExC_parse = exp;
6632     RExC_end = exp + plen;
6633     RExC_naughty = 0;
6634     RExC_npar = 1;
6635     RExC_emit_start = ri->program;
6636     RExC_emit = ri->program;
6637     RExC_emit_bound = ri->program + RExC_size + 1;
6638     pRExC_state->code_index = 0;
6639
6640     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6641     if (reg(pRExC_state, 0, &flags,1) == NULL) {
6642         ReREFCNT_dec(rx);
6643         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6644     }
6645     /* XXXX To minimize changes to RE engine we always allocate
6646        3-units-long substrs field. */
6647     Newx(r->substrs, 1, struct reg_substr_data);
6648     if (RExC_recurse_count) {
6649         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6650         SAVEFREEPV(RExC_recurse);
6651     }
6652
6653 reStudy:
6654     r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6655     Zero(r->substrs, 1, struct reg_substr_data);
6656     if (RExC_study_chunk_recursed)
6657         Zero(RExC_study_chunk_recursed,
6658              RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6659
6660 #ifdef TRIE_STUDY_OPT
6661     if (!restudied) {
6662         StructCopy(&zero_scan_data, &data, scan_data_t);
6663         copyRExC_state = RExC_state;
6664     } else {
6665         U32 seen=RExC_seen;
6666         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6667
6668         RExC_state = copyRExC_state;
6669         if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
6670             RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
6671         else
6672             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
6673         StructCopy(&zero_scan_data, &data, scan_data_t);
6674     }
6675 #else
6676     StructCopy(&zero_scan_data, &data, scan_data_t);
6677 #endif
6678
6679     /* Dig out information for optimizations. */
6680     r->extflags = RExC_flags; /* was pm_op */
6681     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6682
6683     if (UTF)
6684         SvUTF8_on(rx);  /* Unicode in it? */
6685     ri->regstclass = NULL;
6686     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
6687         r->intflags |= PREGf_NAUGHTY;
6688     scan = ri->program + 1;             /* First BRANCH. */
6689
6690     /* testing for BRANCH here tells us whether there is "must appear"
6691        data in the pattern. If there is then we can use it for optimisations */
6692     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /*  Only one top-level choice.
6693                                                   */
6694         SSize_t fake;
6695         STRLEN longest_float_length, longest_fixed_length;
6696         regnode_ssc ch_class; /* pointed to by data */
6697         int stclass_flag;
6698         SSize_t last_close = 0; /* pointed to by data */
6699         regnode *first= scan;
6700         regnode *first_next= regnext(first);
6701         /*
6702          * Skip introductions and multiplicators >= 1
6703          * so that we can extract the 'meat' of the pattern that must
6704          * match in the large if() sequence following.
6705          * NOTE that EXACT is NOT covered here, as it is normally
6706          * picked up by the optimiser separately.
6707          *
6708          * This is unfortunate as the optimiser isnt handling lookahead
6709          * properly currently.
6710          *
6711          */
6712         while ((OP(first) == OPEN && (sawopen = 1)) ||
6713                /* An OR of *one* alternative - should not happen now. */
6714             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6715             /* for now we can't handle lookbehind IFMATCH*/
6716             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6717             (OP(first) == PLUS) ||
6718             (OP(first) == MINMOD) ||
6719                /* An {n,m} with n>0 */
6720             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6721             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6722         {
6723                 /*
6724                  * the only op that could be a regnode is PLUS, all the rest
6725                  * will be regnode_1 or regnode_2.
6726                  *
6727                  * (yves doesn't think this is true)
6728                  */
6729                 if (OP(first) == PLUS)
6730                     sawplus = 1;
6731                 else {
6732                     if (OP(first) == MINMOD)
6733                         sawminmod = 1;
6734                     first += regarglen[OP(first)];
6735                 }
6736                 first = NEXTOPER(first);
6737                 first_next= regnext(first);
6738         }
6739
6740         /* Starting-point info. */
6741       again:
6742         DEBUG_PEEP("first:",first,0);
6743         /* Ignore EXACT as we deal with it later. */
6744         if (PL_regkind[OP(first)] == EXACT) {
6745             if (OP(first) == EXACT)
6746                 NOOP;   /* Empty, get anchored substr later. */
6747             else
6748                 ri->regstclass = first;
6749         }
6750 #ifdef TRIE_STCLASS
6751         else if (PL_regkind[OP(first)] == TRIE &&
6752                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6753         {
6754             regnode *trie_op;
6755             /* this can happen only on restudy */
6756             if ( OP(first) == TRIE ) {
6757                 struct regnode_1 *trieop = (struct regnode_1 *)
6758                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
6759                 StructCopy(first,trieop,struct regnode_1);
6760                 trie_op=(regnode *)trieop;
6761             } else {
6762                 struct regnode_charclass *trieop = (struct regnode_charclass *)
6763                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6764                 StructCopy(first,trieop,struct regnode_charclass);
6765                 trie_op=(regnode *)trieop;
6766             }
6767             OP(trie_op)+=2;
6768             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6769             ri->regstclass = trie_op;
6770         }
6771 #endif
6772         else if (REGNODE_SIMPLE(OP(first)))
6773             ri->regstclass = first;
6774         else if (PL_regkind[OP(first)] == BOUND ||
6775                  PL_regkind[OP(first)] == NBOUND)
6776             ri->regstclass = first;
6777         else if (PL_regkind[OP(first)] == BOL) {
6778             r->intflags |= (OP(first) == MBOL
6779                            ? PREGf_ANCH_MBOL
6780                            : (OP(first) == SBOL
6781                               ? PREGf_ANCH_SBOL
6782                               : PREGf_ANCH_BOL));
6783             first = NEXTOPER(first);
6784             goto again;
6785         }
6786         else if (OP(first) == GPOS) {
6787             r->intflags |= PREGf_ANCH_GPOS;
6788             first = NEXTOPER(first);
6789             goto again;
6790         }
6791         else if ((!sawopen || !RExC_sawback) &&
6792             (OP(first) == STAR &&
6793             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6794             !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
6795         {
6796             /* turn .* into ^.* with an implied $*=1 */
6797             const int type =
6798                 (OP(NEXTOPER(first)) == REG_ANY)
6799                     ? PREGf_ANCH_MBOL
6800                     : PREGf_ANCH_SBOL;
6801             r->intflags |= (type | PREGf_IMPLICIT);
6802             first = NEXTOPER(first);
6803             goto again;
6804         }
6805         if (sawplus && !sawminmod && !sawlookahead
6806             && (!sawopen || !RExC_sawback)
6807             && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6808             /* x+ must match at the 1st pos of run of x's */
6809             r->intflags |= PREGf_SKIP;
6810
6811         /* Scan is after the zeroth branch, first is atomic matcher. */
6812 #ifdef TRIE_STUDY_OPT
6813         DEBUG_PARSE_r(
6814             if (!restudied)
6815                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6816                               (IV)(first - scan + 1))
6817         );
6818 #else
6819         DEBUG_PARSE_r(
6820             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6821                 (IV)(first - scan + 1))
6822         );
6823 #endif
6824
6825
6826         /*
6827         * If there's something expensive in the r.e., find the
6828         * longest literal string that must appear and make it the
6829         * regmust.  Resolve ties in favor of later strings, since
6830         * the regstart check works with the beginning of the r.e.
6831         * and avoiding duplication strengthens checking.  Not a
6832         * strong reason, but sufficient in the absence of others.
6833         * [Now we resolve ties in favor of the earlier string if
6834         * it happens that c_offset_min has been invalidated, since the
6835         * earlier string may buy us something the later one won't.]
6836         */
6837
6838         data.longest_fixed = newSVpvs("");
6839         data.longest_float = newSVpvs("");
6840         data.last_found = newSVpvs("");
6841         data.longest = &(data.longest_fixed);
6842         ENTER_with_name("study_chunk");
6843         SAVEFREESV(data.longest_fixed);
6844         SAVEFREESV(data.longest_float);
6845         SAVEFREESV(data.last_found);
6846         first = scan;
6847         if (!ri->regstclass) {
6848             ssc_init(pRExC_state, &ch_class);
6849             data.start_class = &ch_class;
6850             stclass_flag = SCF_DO_STCLASS_AND;
6851         } else                          /* XXXX Check for BOUND? */
6852             stclass_flag = 0;
6853         data.last_closep = &last_close;
6854
6855         DEBUG_RExC_seen();
6856         minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
6857                              scan + RExC_size, /* Up to end */
6858             &data, -1, 0, NULL,
6859             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
6860                           | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6861             0);
6862
6863
6864         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6865
6866
6867         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6868              && data.last_start_min == 0 && data.last_end > 0
6869              && !RExC_seen_zerolen
6870              && !(RExC_seen & REG_VERBARG_SEEN)
6871              && !(RExC_seen & REG_GPOS_SEEN)
6872         ){
6873             r->extflags |= RXf_CHECK_ALL;
6874         }
6875         scan_commit(pRExC_state, &data,&minlen,0);
6876
6877         longest_float_length = CHR_SVLEN(data.longest_float);
6878
6879         if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
6880                    && data.offset_fixed == data.offset_float_min
6881                    && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6882             && S_setup_longest (aTHX_ pRExC_state,
6883                                     data.longest_float,
6884                                     &(r->float_utf8),
6885                                     &(r->float_substr),
6886                                     &(r->float_end_shift),
6887                                     data.lookbehind_float,
6888                                     data.offset_float_min,
6889                                     data.minlen_float,
6890                                     longest_float_length,
6891                                     cBOOL(data.flags & SF_FL_BEFORE_EOL),
6892                                     cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6893         {
6894             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6895             r->float_max_offset = data.offset_float_max;
6896             if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
6897                 r->float_max_offset -= data.lookbehind_float;
6898             SvREFCNT_inc_simple_void_NN(data.longest_float);
6899         }
6900         else {
6901             r->float_substr = r->float_utf8 = NULL;
6902             longest_float_length = 0;
6903         }
6904
6905         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6906
6907         if (S_setup_longest (aTHX_ pRExC_state,
6908                                 data.longest_fixed,
6909                                 &(r->anchored_utf8),
6910                                 &(r->anchored_substr),
6911                                 &(r->anchored_end_shift),
6912                                 data.lookbehind_fixed,
6913                                 data.offset_fixed,
6914                                 data.minlen_fixed,
6915                                 longest_fixed_length,
6916                                 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6917                                 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6918         {
6919             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6920             SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6921         }
6922         else {
6923             r->anchored_substr = r->anchored_utf8 = NULL;
6924             longest_fixed_length = 0;
6925         }
6926         LEAVE_with_name("study_chunk");
6927
6928         if (ri->regstclass
6929             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6930             ri->regstclass = NULL;
6931
6932         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6933             && stclass_flag
6934             && ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
6935             && !ssc_is_anything(data.start_class))
6936         {
6937             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
6938
6939             ssc_finalize(pRExC_state, data.start_class);
6940
6941             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
6942             StructCopy(data.start_class,
6943                        (regnode_ssc*)RExC_rxi->data->data[n],
6944                        regnode_ssc);
6945             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6946             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6947             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6948                       regprop(r, sv, (regnode*)data.start_class, NULL);
6949                       PerlIO_printf(Perl_debug_log,
6950                                     "synthetic stclass \"%s\".\n",
6951                                     SvPVX_const(sv));});
6952             data.start_class = NULL;
6953         }
6954
6955         /* A temporary algorithm prefers floated substr to fixed one to dig
6956          * more info. */
6957         if (longest_fixed_length > longest_float_length) {
6958             r->substrs->check_ix = 0;
6959             r->check_end_shift = r->anchored_end_shift;
6960             r->check_substr = r->anchored_substr;
6961             r->check_utf8 = r->anchored_utf8;
6962             r->check_offset_min = r->check_offset_max = r->anchored_offset;
6963             if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
6964                 r->intflags |= PREGf_NOSCAN;
6965         }
6966         else {
6967             r->substrs->check_ix = 1;
6968             r->check_end_shift = r->float_end_shift;
6969             r->check_substr = r->float_substr;
6970             r->check_utf8 = r->float_utf8;
6971             r->check_offset_min = r->float_min_offset;
6972             r->check_offset_max = r->float_max_offset;
6973         }
6974         if ((r->check_substr || r->check_utf8) ) {
6975             r->extflags |= RXf_USE_INTUIT;
6976             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6977                 r->extflags |= RXf_INTUIT_TAIL;
6978         }
6979         r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
6980
6981         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6982         if ( (STRLEN)minlen < longest_float_length )
6983             minlen= longest_float_length;
6984         if ( (STRLEN)minlen < longest_fixed_length )
6985             minlen= longest_fixed_length;
6986         */
6987     }
6988     else {
6989         /* Several toplevels. Best we can is to set minlen. */
6990         SSize_t fake;
6991         regnode_ssc ch_class;
6992         SSize_t last_close = 0;
6993
6994         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6995
6996         scan = ri->program + 1;
6997         ssc_init(pRExC_state, &ch_class);
6998         data.start_class = &ch_class;
6999         data.last_closep = &last_close;
7000
7001         DEBUG_RExC_seen();
7002         minlen = study_chunk(pRExC_state,
7003             &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7004             SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7005                                                       ? SCF_TRIE_DOING_RESTUDY
7006                                                       : 0),
7007             0);
7008
7009         CHECK_RESTUDY_GOTO_butfirst(NOOP);
7010
7011         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7012                 = r->float_substr = r->float_utf8 = NULL;
7013
7014         if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
7015             && ! ssc_is_anything(data.start_class))
7016         {
7017             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7018
7019             ssc_finalize(pRExC_state, data.start_class);
7020
7021             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7022             StructCopy(data.start_class,
7023                        (regnode_ssc*)RExC_rxi->data->data[n],
7024                        regnode_ssc);
7025             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7026             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7027             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7028                       regprop(r, sv, (regnode*)data.start_class, NULL);
7029                       PerlIO_printf(Perl_debug_log,
7030                                     "synthetic stclass \"%s\".\n",
7031                                     SvPVX_const(sv));});
7032             data.start_class = NULL;
7033         }
7034     }
7035
7036     if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7037         r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7038         r->maxlen = REG_INFTY;
7039     }
7040     else {
7041         r->maxlen = RExC_maxlen;
7042     }
7043
7044     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7045        the "real" pattern. */
7046     DEBUG_OPTIMISE_r({
7047         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%ld\n",
7048                       (IV)minlen, (IV)r->minlen, RExC_maxlen);
7049     });
7050     r->minlenret = minlen;
7051     if (r->minlen < minlen)
7052         r->minlen = minlen;
7053
7054     if (RExC_seen & REG_GPOS_SEEN)
7055         r->intflags |= PREGf_GPOS_SEEN;
7056     if (RExC_seen & REG_LOOKBEHIND_SEEN)
7057         r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7058                                                 lookbehind */
7059     if (pRExC_state->num_code_blocks)
7060         r->extflags |= RXf_EVAL_SEEN;
7061     if (RExC_seen & REG_CANY_SEEN)
7062         r->intflags |= PREGf_CANY_SEEN;
7063     if (RExC_seen & REG_VERBARG_SEEN)
7064     {
7065         r->intflags |= PREGf_VERBARG_SEEN;
7066         r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7067     }
7068     if (RExC_seen & REG_CUTGROUP_SEEN)
7069         r->intflags |= PREGf_CUTGROUP_SEEN;
7070     if (pm_flags & PMf_USE_RE_EVAL)
7071         r->intflags |= PREGf_USE_RE_EVAL;
7072     if (RExC_paren_names)
7073         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7074     else
7075         RXp_PAREN_NAMES(r) = NULL;
7076
7077     /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7078      * so it can be used in pp.c */
7079     if (r->intflags & PREGf_ANCH)
7080         r->extflags |= RXf_IS_ANCHORED;
7081
7082
7083     {
7084         /* this is used to identify "special" patterns that might result
7085          * in Perl NOT calling the regex engine and instead doing the match "itself",
7086          * particularly special cases in split//. By having the regex compiler
7087          * do this pattern matching at a regop level (instead of by inspecting the pattern)
7088          * we avoid weird issues with equivalent patterns resulting in different behavior,
7089          * AND we allow non Perl engines to get the same optimizations by the setting the
7090          * flags appropriately - Yves */
7091         regnode *first = ri->program + 1;
7092         U8 fop = OP(first);
7093         regnode *next = NEXTOPER(first);
7094         U8 nop = OP(next);
7095
7096         if (PL_regkind[fop] == NOTHING && nop == END)
7097             r->extflags |= RXf_NULL;
7098         else if (PL_regkind[fop] == BOL && nop == END)
7099             r->extflags |= RXf_START_ONLY;
7100         else if (fop == PLUS
7101                  && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7102                  && OP(regnext(first)) == END)
7103             r->extflags |= RXf_WHITE;
7104         else if ( r->extflags & RXf_SPLIT
7105                   && fop == EXACT
7106                   && STR_LEN(first) == 1
7107                   && *(STRING(first)) == ' '
7108                   && OP(regnext(first)) == END )
7109             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7110
7111     }
7112
7113     if (RExC_contains_locale) {
7114         RXp_EXTFLAGS(r) |= RXf_TAINTED;
7115     }
7116
7117 #ifdef DEBUGGING
7118     if (RExC_paren_names) {
7119         ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7120         ri->data->data[ri->name_list_idx]
7121                                    = (void*)SvREFCNT_inc(RExC_paren_name_list);
7122     } else
7123 #endif
7124         ri->name_list_idx = 0;
7125
7126     if (RExC_recurse_count) {
7127         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
7128             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
7129             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
7130         }
7131     }
7132     Newxz(r->offs, RExC_npar, regexp_paren_pair);
7133     /* assume we don't need to swap parens around before we match */
7134
7135     DEBUG_DUMP_r({
7136         DEBUG_RExC_seen();
7137         PerlIO_printf(Perl_debug_log,"Final program:\n");
7138         regdump(r);
7139     });
7140 #ifdef RE_TRACK_PATTERN_OFFSETS
7141     DEBUG_OFFSETS_r(if (ri->u.offsets) {
7142         const STRLEN len = ri->u.offsets[0];
7143         STRLEN i;
7144         GET_RE_DEBUG_FLAGS_DECL;
7145         PerlIO_printf(Perl_debug_log,
7146                       "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7147         for (i = 1; i <= len; i++) {
7148             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7149                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7150                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7151             }
7152         PerlIO_printf(Perl_debug_log, "\n");
7153     });
7154 #endif
7155
7156 #ifdef USE_ITHREADS
7157     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7158      * by setting the regexp SV to readonly-only instead. If the
7159      * pattern's been recompiled, the USEDness should remain. */
7160     if (old_re && SvREADONLY(old_re))
7161         SvREADONLY_on(rx);
7162 #endif
7163     return rx;
7164 }
7165
7166
7167 SV*
7168 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7169                     const U32 flags)
7170 {
7171     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7172
7173     PERL_UNUSED_ARG(value);
7174
7175     if (flags & RXapif_FETCH) {
7176         return reg_named_buff_fetch(rx, key, flags);
7177     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7178         Perl_croak_no_modify();
7179         return NULL;
7180     } else if (flags & RXapif_EXISTS) {
7181         return reg_named_buff_exists(rx, key, flags)
7182             ? &PL_sv_yes
7183             : &PL_sv_no;
7184     } else if (flags & RXapif_REGNAMES) {
7185         return reg_named_buff_all(rx, flags);
7186     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7187         return reg_named_buff_scalar(rx, flags);
7188     } else {
7189         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7190         return NULL;
7191     }
7192 }
7193
7194 SV*
7195 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7196                          const U32 flags)
7197 {
7198     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7199     PERL_UNUSED_ARG(lastkey);
7200
7201     if (flags & RXapif_FIRSTKEY)
7202         return reg_named_buff_firstkey(rx, flags);
7203     else if (flags & RXapif_NEXTKEY)
7204         return reg_named_buff_nextkey(rx, flags);
7205     else {
7206         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7207                                             (int)flags);
7208         return NULL;
7209     }
7210 }
7211
7212 SV*
7213 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7214                           const U32 flags)
7215 {
7216     AV *retarray = NULL;
7217     SV *ret;
7218     struct regexp *const rx = ReANY(r);
7219
7220     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7221
7222     if (flags & RXapif_ALL)
7223         retarray=newAV();
7224
7225     if (rx && RXp_PAREN_NAMES(rx)) {
7226         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7227         if (he_str) {
7228             IV i;
7229             SV* sv_dat=HeVAL(he_str);
7230             I32 *nums=(I32*)SvPVX(sv_dat);
7231             for ( i=0; i<SvIVX(sv_dat); i++ ) {
7232                 if ((I32)(rx->nparens) >= nums[i]
7233                     && rx->offs[nums[i]].start != -1
7234                     && rx->offs[nums[i]].end != -1)
7235                 {
7236                     ret = newSVpvs("");
7237                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7238                     if (!retarray)
7239                         return ret;
7240                 } else {
7241                     if (retarray)
7242                         ret = newSVsv(&PL_sv_undef);
7243                 }
7244                 if (retarray)
7245                     av_push(retarray, ret);
7246             }
7247             if (retarray)
7248                 return newRV_noinc(MUTABLE_SV(retarray));
7249         }
7250     }
7251     return NULL;
7252 }
7253
7254 bool
7255 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7256                            const U32 flags)
7257 {
7258     struct regexp *const rx = ReANY(r);
7259
7260     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7261
7262     if (rx && RXp_PAREN_NAMES(rx)) {
7263         if (flags & RXapif_ALL) {
7264             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7265         } else {
7266             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7267             if (sv) {
7268                 SvREFCNT_dec_NN(sv);
7269                 return TRUE;
7270             } else {
7271                 return FALSE;
7272             }
7273         }
7274     } else {
7275         return FALSE;
7276     }
7277 }
7278
7279 SV*
7280 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7281 {
7282     struct regexp *const rx = ReANY(r);
7283
7284     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7285
7286     if ( rx && RXp_PAREN_NAMES(rx) ) {
7287         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7288
7289         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7290     } else {
7291         return FALSE;
7292     }
7293 }
7294
7295 SV*
7296 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7297 {
7298     struct regexp *const rx = ReANY(r);
7299     GET_RE_DEBUG_FLAGS_DECL;
7300
7301     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7302
7303     if (rx && RXp_PAREN_NAMES(rx)) {
7304         HV *hv = RXp_PAREN_NAMES(rx);
7305         HE *temphe;
7306         while ( (temphe = hv_iternext_flags(hv,0)) ) {
7307             IV i;
7308             IV parno = 0;
7309             SV* sv_dat = HeVAL(temphe);
7310             I32 *nums = (I32*)SvPVX(sv_dat);
7311             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7312                 if ((I32)(rx->lastparen) >= nums[i] &&
7313                     rx->offs[nums[i]].start != -1 &&
7314                     rx->offs[nums[i]].end != -1)
7315                 {
7316                     parno = nums[i];
7317                     break;
7318                 }
7319             }
7320             if (parno || flags & RXapif_ALL) {
7321                 return newSVhek(HeKEY_hek(temphe));
7322             }
7323         }
7324     }
7325     return NULL;
7326 }
7327
7328 SV*
7329 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7330 {
7331     SV *ret;
7332     AV *av;
7333     SSize_t length;
7334     struct regexp *const rx = ReANY(r);
7335
7336     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7337
7338     if (rx && RXp_PAREN_NAMES(rx)) {
7339         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7340             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7341         } else if (flags & RXapif_ONE) {
7342             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7343             av = MUTABLE_AV(SvRV(ret));
7344             length = av_tindex(av);
7345             SvREFCNT_dec_NN(ret);
7346             return newSViv(length + 1);
7347         } else {
7348             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7349                                                 (int)flags);
7350             return NULL;
7351         }
7352     }
7353     return &PL_sv_undef;
7354 }
7355
7356 SV*
7357 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7358 {
7359     struct regexp *const rx = ReANY(r);
7360     AV *av = newAV();
7361
7362     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7363
7364     if (rx && RXp_PAREN_NAMES(rx)) {
7365         HV *hv= RXp_PAREN_NAMES(rx);
7366         HE *temphe;
7367         (void)hv_iterinit(hv);
7368         while ( (temphe = hv_iternext_flags(hv,0)) ) {
7369             IV i;
7370             IV parno = 0;
7371             SV* sv_dat = HeVAL(temphe);
7372             I32 *nums = (I32*)SvPVX(sv_dat);
7373             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7374                 if ((I32)(rx->lastparen) >= nums[i] &&
7375                     rx->offs[nums[i]].start != -1 &&
7376                     rx->offs[nums[i]].end != -1)
7377                 {
7378                     parno = nums[i];
7379                     break;
7380                 }
7381             }
7382             if (parno || flags & RXapif_ALL) {
7383                 av_push(av, newSVhek(HeKEY_hek(temphe)));
7384             }
7385         }
7386     }
7387
7388     return newRV_noinc(MUTABLE_SV(av));
7389 }
7390
7391 void
7392 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7393                              SV * const sv)
7394 {
7395     struct regexp *const rx = ReANY(r);
7396     char *s = NULL;
7397     SSize_t i = 0;
7398     SSize_t s1, t1;
7399     I32 n = paren;
7400
7401     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7402
7403     if (      n == RX_BUFF_IDX_CARET_PREMATCH
7404            || n == RX_BUFF_IDX_CARET_FULLMATCH
7405            || n == RX_BUFF_IDX_CARET_POSTMATCH
7406        )
7407     {
7408         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7409         if (!keepcopy) {
7410             /* on something like
7411              *    $r = qr/.../;
7412              *    /$qr/p;
7413              * the KEEPCOPY is set on the PMOP rather than the regex */
7414             if (PL_curpm && r == PM_GETRE(PL_curpm))
7415                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7416         }
7417         if (!keepcopy)
7418             goto ret_undef;
7419     }
7420
7421     if (!rx->subbeg)
7422         goto ret_undef;
7423
7424     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7425         /* no need to distinguish between them any more */
7426         n = RX_BUFF_IDX_FULLMATCH;
7427
7428     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7429         && rx->offs[0].start != -1)
7430     {
7431         /* $`, ${^PREMATCH} */
7432         i = rx->offs[0].start;
7433         s = rx->subbeg;
7434     }
7435     else
7436     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7437         && rx->offs[0].end != -1)
7438     {
7439         /* $', ${^POSTMATCH} */
7440         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7441         i = rx->sublen + rx->suboffset - rx->offs[0].end;
7442     }
7443     else
7444     if ( 0 <= n && n <= (I32)rx->nparens &&
7445         (s1 = rx->offs[n].start) != -1 &&
7446         (t1 = rx->offs[n].end) != -1)
7447     {
7448         /* $&, ${^MATCH},  $1 ... */
7449         i = t1 - s1;
7450         s = rx->subbeg + s1 - rx->suboffset;
7451     } else {
7452         goto ret_undef;
7453     }
7454
7455     assert(s >= rx->subbeg);
7456     assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7457     if (i >= 0) {
7458 #ifdef NO_TAINT_SUPPORT
7459         sv_setpvn(sv, s, i);
7460 #else
7461         const int oldtainted = TAINT_get;
7462         TAINT_NOT;
7463         sv_setpvn(sv, s, i);
7464         TAINT_set(oldtainted);
7465 #endif
7466         if ( (rx->intflags & PREGf_CANY_SEEN)
7467             ? (RXp_MATCH_UTF8(rx)
7468                         && (!i || is_utf8_string((U8*)s, i)))
7469             : (RXp_MATCH_UTF8(rx)) )
7470         {
7471             SvUTF8_on(sv);
7472         }
7473         else
7474             SvUTF8_off(sv);
7475         if (TAINTING_get) {
7476             if (RXp_MATCH_TAINTED(rx)) {
7477                 if (SvTYPE(sv) >= SVt_PVMG) {
7478                     MAGIC* const mg = SvMAGIC(sv);
7479                     MAGIC* mgt;
7480                     TAINT;
7481                     SvMAGIC_set(sv, mg->mg_moremagic);
7482                     SvTAINT(sv);
7483                     if ((mgt = SvMAGIC(sv))) {
7484                         mg->mg_moremagic = mgt;
7485                         SvMAGIC_set(sv, mg);
7486                     }
7487                 } else {
7488                     TAINT;
7489                     SvTAINT(sv);
7490                 }
7491             } else
7492                 SvTAINTED_off(sv);
7493         }
7494     } else {
7495       ret_undef:
7496         sv_setsv(sv,&PL_sv_undef);
7497         return;
7498     }
7499 }
7500
7501 void
7502 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7503                                                          SV const * const value)
7504 {
7505     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7506
7507     PERL_UNUSED_ARG(rx);
7508     PERL_UNUSED_ARG(paren);
7509     PERL_UNUSED_ARG(value);
7510
7511     if (!PL_localizing)
7512         Perl_croak_no_modify();
7513 }
7514
7515 I32
7516 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7517                               const I32 paren)
7518 {
7519     struct regexp *const rx = ReANY(r);
7520     I32 i;
7521     I32 s1, t1;
7522
7523     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7524
7525     if (   paren == RX_BUFF_IDX_CARET_PREMATCH
7526         || paren == RX_BUFF_IDX_CARET_FULLMATCH
7527         || paren == RX_BUFF_IDX_CARET_POSTMATCH
7528     )
7529     {
7530         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7531         if (!keepcopy) {
7532             /* on something like
7533              *    $r = qr/.../;
7534              *    /$qr/p;
7535              * the KEEPCOPY is set on the PMOP rather than the regex */
7536             if (PL_curpm && r == PM_GETRE(PL_curpm))
7537                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7538         }
7539         if (!keepcopy)
7540             goto warn_undef;
7541     }
7542
7543     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7544     switch (paren) {
7545       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7546       case RX_BUFF_IDX_PREMATCH:       /* $` */
7547         if (rx->offs[0].start != -1) {
7548                         i = rx->offs[0].start;
7549                         if (i > 0) {
7550                                 s1 = 0;
7551                                 t1 = i;
7552                                 goto getlen;
7553                         }
7554             }
7555         return 0;
7556
7557       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7558       case RX_BUFF_IDX_POSTMATCH:       /* $' */
7559             if (rx->offs[0].end != -1) {
7560                         i = rx->sublen - rx->offs[0].end;
7561                         if (i > 0) {
7562                                 s1 = rx->offs[0].end;
7563                                 t1 = rx->sublen;
7564                                 goto getlen;
7565                         }
7566             }
7567         return 0;
7568
7569       default: /* $& / ${^MATCH}, $1, $2, ... */
7570             if (paren <= (I32)rx->nparens &&
7571             (s1 = rx->offs[paren].start) != -1 &&
7572             (t1 = rx->offs[paren].end) != -1)
7573             {
7574             i = t1 - s1;
7575             goto getlen;
7576         } else {
7577           warn_undef:
7578             if (ckWARN(WARN_UNINITIALIZED))
7579                 report_uninit((const SV *)sv);
7580             return 0;
7581         }
7582     }
7583   getlen:
7584     if (i > 0 && RXp_MATCH_UTF8(rx)) {
7585         const char * const s = rx->subbeg - rx->suboffset + s1;
7586         const U8 *ep;
7587         STRLEN el;
7588
7589         i = t1 - s1;
7590         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7591                         i = el;
7592     }
7593     return i;
7594 }
7595
7596 SV*
7597 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7598 {
7599     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7600         PERL_UNUSED_ARG(rx);
7601         if (0)
7602             return NULL;
7603         else
7604             return newSVpvs("Regexp");
7605 }
7606
7607 /* Scans the name of a named buffer from the pattern.
7608  * If flags is REG_RSN_RETURN_NULL returns null.
7609  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7610  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7611  * to the parsed name as looked up in the RExC_paren_names hash.
7612  * If there is an error throws a vFAIL().. type exception.
7613  */
7614
7615 #define REG_RSN_RETURN_NULL    0
7616 #define REG_RSN_RETURN_NAME    1
7617 #define REG_RSN_RETURN_DATA    2
7618
7619 STATIC SV*
7620 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7621 {
7622     char *name_start = RExC_parse;
7623
7624     PERL_ARGS_ASSERT_REG_SCAN_NAME;
7625
7626     assert (RExC_parse <= RExC_end);
7627     if (RExC_parse == RExC_end) NOOP;
7628     else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7629          /* skip IDFIRST by using do...while */
7630         if (UTF)
7631             do {
7632                 RExC_parse += UTF8SKIP(RExC_parse);
7633             } while (isWORDCHAR_utf8((U8*)RExC_parse));
7634         else
7635             do {
7636                 RExC_parse++;
7637             } while (isWORDCHAR(*RExC_parse));
7638     } else {
7639         RExC_parse++; /* so the <- from the vFAIL is after the offending
7640                          character */
7641         vFAIL("Group name must start with a non-digit word character");
7642     }
7643     if ( flags ) {
7644         SV* sv_name
7645             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7646                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7647         if ( flags == REG_RSN_RETURN_NAME)
7648             return sv_name;
7649         else if (flags==REG_RSN_RETURN_DATA) {
7650             HE *he_str = NULL;
7651             SV *sv_dat = NULL;
7652             if ( ! sv_name )      /* should not happen*/
7653                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7654             if (RExC_paren_names)
7655                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7656             if ( he_str )
7657                 sv_dat = HeVAL(he_str);
7658             if ( ! sv_dat )
7659                 vFAIL("Reference to nonexistent named group");
7660             return sv_dat;
7661         }
7662         else {
7663             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7664                        (unsigned long) flags);
7665         }
7666         assert(0); /* NOT REACHED */
7667     }
7668     return NULL;
7669 }
7670
7671 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
7672     int rem=(int)(RExC_end - RExC_parse);                       \
7673     int cut;                                                    \
7674     int num;                                                    \
7675     int iscut=0;                                                \
7676     if (rem>10) {                                               \
7677         rem=10;                                                 \
7678         iscut=1;                                                \
7679     }                                                           \
7680     cut=10-rem;                                                 \
7681     if (RExC_lastparse!=RExC_parse)                             \
7682         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
7683             rem, RExC_parse,                                    \
7684             cut + 4,                                            \
7685             iscut ? "..." : "<"                                 \
7686         );                                                      \
7687     else                                                        \
7688         PerlIO_printf(Perl_debug_log,"%16s","");                \
7689                                                                 \
7690     if (SIZE_ONLY)                                              \
7691        num = RExC_size + 1;                                     \
7692     else                                                        \
7693        num=REG_NODE_NUM(RExC_emit);                             \
7694     if (RExC_lastnum!=num)                                      \
7695        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
7696     else                                                        \
7697        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
7698     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
7699         (int)((depth*2)), "",                                   \
7700         (funcname)                                              \
7701     );                                                          \
7702     RExC_lastnum=num;                                           \
7703     RExC_lastparse=RExC_parse;                                  \
7704 })
7705
7706
7707
7708 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
7709     DEBUG_PARSE_MSG((funcname));                            \
7710     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
7711 })
7712 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
7713     DEBUG_PARSE_MSG((funcname));                            \
7714     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
7715 })
7716
7717 /* This section of code defines the inversion list object and its methods.  The
7718  * interfaces are highly subject to change, so as much as possible is static to
7719  * this file.  An inversion list is here implemented as a malloc'd C UV array
7720  * as an SVt_INVLIST scalar.
7721  *
7722  * An inversion list for Unicode is an array of code points, sorted by ordinal
7723  * number.  The zeroth element is the first code point in the list.  The 1th
7724  * element is the first element beyond that not in the list.  In other words,
7725  * the first range is
7726  *  invlist[0]..(invlist[1]-1)
7727  * The other ranges follow.  Thus every element whose index is divisible by two
7728  * marks the beginning of a range that is in the list, and every element not
7729  * divisible by two marks the beginning of a range not in the list.  A single
7730  * element inversion list that contains the single code point N generally
7731  * consists of two elements
7732  *  invlist[0] == N
7733  *  invlist[1] == N+1
7734  * (The exception is when N is the highest representable value on the
7735  * machine, in which case the list containing just it would be a single
7736  * element, itself.  By extension, if the last range in the list extends to
7737  * infinity, then the first element of that range will be in the inversion list
7738  * at a position that is divisible by two, and is the final element in the
7739  * list.)
7740  * Taking the complement (inverting) an inversion list is quite simple, if the
7741  * first element is 0, remove it; otherwise add a 0 element at the beginning.
7742  * This implementation reserves an element at the beginning of each inversion
7743  * list to always contain 0; there is an additional flag in the header which
7744  * indicates if the list begins at the 0, or is offset to begin at the next
7745  * element.
7746  *
7747  * More about inversion lists can be found in "Unicode Demystified"
7748  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7749  * More will be coming when functionality is added later.
7750  *
7751  * The inversion list data structure is currently implemented as an SV pointing
7752  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
7753  * array of UV whose memory management is automatically handled by the existing
7754  * facilities for SV's.
7755  *
7756  * Some of the methods should always be private to the implementation, and some
7757  * should eventually be made public */
7758
7759 /* The header definitions are in F<inline_invlist.c> */
7760
7761 PERL_STATIC_INLINE UV*
7762 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7763 {
7764     /* Returns a pointer to the first element in the inversion list's array.
7765      * This is called upon initialization of an inversion list.  Where the
7766      * array begins depends on whether the list has the code point U+0000 in it
7767      * or not.  The other parameter tells it whether the code that follows this
7768      * call is about to put a 0 in the inversion list or not.  The first
7769      * element is either the element reserved for 0, if TRUE, or the element
7770      * after it, if FALSE */
7771
7772     bool* offset = get_invlist_offset_addr(invlist);
7773     UV* zero_addr = (UV *) SvPVX(invlist);
7774
7775     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7776
7777     /* Must be empty */
7778     assert(! _invlist_len(invlist));
7779
7780     *zero_addr = 0;
7781
7782     /* 1^1 = 0; 1^0 = 1 */
7783     *offset = 1 ^ will_have_0;
7784     return zero_addr + *offset;
7785 }
7786
7787 PERL_STATIC_INLINE UV*
7788 S_invlist_array(pTHX_ SV* const invlist)
7789 {
7790     /* Returns the pointer to the inversion list's array.  Every time the
7791      * length changes, this needs to be called in case malloc or realloc moved
7792      * it */
7793
7794     PERL_ARGS_ASSERT_INVLIST_ARRAY;
7795
7796     /* Must not be empty.  If these fail, you probably didn't check for <len>
7797      * being non-zero before trying to get the array */
7798     assert(_invlist_len(invlist));
7799
7800     /* The very first element always contains zero, The array begins either
7801      * there, or if the inversion list is offset, at the element after it.
7802      * The offset header field determines which; it contains 0 or 1 to indicate
7803      * how much additionally to add */
7804     assert(0 == *(SvPVX(invlist)));
7805     return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
7806 }
7807
7808 PERL_STATIC_INLINE void
7809 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
7810 {
7811     /* Sets the current number of elements stored in the inversion list.
7812      * Updates SvCUR correspondingly */
7813
7814     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7815
7816     assert(SvTYPE(invlist) == SVt_INVLIST);
7817
7818     SvCUR_set(invlist,
7819               (len == 0)
7820                ? 0
7821                : TO_INTERNAL_SIZE(len + offset));
7822     assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
7823 }
7824
7825 PERL_STATIC_INLINE IV*
7826 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7827 {
7828     /* Return the address of the IV that is reserved to hold the cached index
7829      * */
7830
7831     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7832
7833     assert(SvTYPE(invlist) == SVt_INVLIST);
7834
7835     return &(((XINVLIST*) SvANY(invlist))->prev_index);
7836 }
7837
7838 PERL_STATIC_INLINE IV
7839 S_invlist_previous_index(pTHX_ SV* const invlist)
7840 {
7841     /* Returns cached index of previous search */
7842
7843     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7844
7845     return *get_invlist_previous_index_addr(invlist);
7846 }
7847
7848 PERL_STATIC_INLINE void
7849 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7850 {
7851     /* Caches <index> for later retrieval */
7852
7853     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7854
7855     assert(index == 0 || index < (int) _invlist_len(invlist));
7856
7857     *get_invlist_previous_index_addr(invlist) = index;
7858 }
7859
7860 PERL_STATIC_INLINE UV
7861 S_invlist_max(pTHX_ SV* const invlist)
7862 {
7863     /* Returns the maximum number of elements storable in the inversion list's
7864      * array, without having to realloc() */
7865
7866     PERL_ARGS_ASSERT_INVLIST_MAX;
7867
7868     assert(SvTYPE(invlist) == SVt_INVLIST);
7869
7870     /* Assumes worst case, in which the 0 element is not counted in the
7871      * inversion list, so subtracts 1 for that */
7872     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
7873            ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
7874            : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
7875 }
7876
7877 #ifndef PERL_IN_XSUB_RE
7878 SV*
7879 Perl__new_invlist(pTHX_ IV initial_size)
7880 {
7881
7882     /* Return a pointer to a newly constructed inversion list, with enough
7883      * space to store 'initial_size' elements.  If that number is negative, a
7884      * system default is used instead */
7885
7886     SV* new_list;
7887
7888     if (initial_size < 0) {
7889         initial_size = 10;
7890     }
7891
7892     /* Allocate the initial space */
7893     new_list = newSV_type(SVt_INVLIST);
7894
7895     /* First 1 is in case the zero element isn't in the list; second 1 is for
7896      * trailing NUL */
7897     SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
7898     invlist_set_len(new_list, 0, 0);
7899
7900     /* Force iterinit() to be used to get iteration to work */
7901     *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
7902
7903     *get_invlist_previous_index_addr(new_list) = 0;
7904
7905     return new_list;
7906 }
7907
7908 SV*
7909 Perl__new_invlist_C_array(pTHX_ const UV* const list)
7910 {
7911     /* Return a pointer to a newly constructed inversion list, initialized to
7912      * point to <list>, which has to be in the exact correct inversion list
7913      * form, including internal fields.  Thus this is a dangerous routine that
7914      * should not be used in the wrong hands.  The passed in 'list' contains
7915      * several header fields at the beginning that are not part of the
7916      * inversion list body proper */
7917
7918     const STRLEN length = (STRLEN) list[0];
7919     const UV version_id =          list[1];
7920     const bool offset   =    cBOOL(list[2]);
7921 #define HEADER_LENGTH 3
7922     /* If any of the above changes in any way, you must change HEADER_LENGTH
7923      * (if appropriate) and regenerate INVLIST_VERSION_ID by running
7924      *      perl -E 'say int(rand 2**31-1)'
7925      */
7926 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
7927                                         data structure type, so that one being
7928                                         passed in can be validated to be an
7929                                         inversion list of the correct vintage.
7930                                        */
7931
7932     SV* invlist = newSV_type(SVt_INVLIST);
7933
7934     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7935
7936     if (version_id != INVLIST_VERSION_ID) {
7937         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7938     }
7939
7940     /* The generated array passed in includes header elements that aren't part
7941      * of the list proper, so start it just after them */
7942     SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
7943
7944     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
7945                                shouldn't touch it */
7946
7947     *(get_invlist_offset_addr(invlist)) = offset;
7948
7949     /* The 'length' passed to us is the physical number of elements in the
7950      * inversion list.  But if there is an offset the logical number is one
7951      * less than that */
7952     invlist_set_len(invlist, length  - offset, offset);
7953
7954     invlist_set_previous_index(invlist, 0);
7955
7956     /* Initialize the iteration pointer. */
7957     invlist_iterfinish(invlist);
7958
7959     SvREADONLY_on(invlist);
7960
7961     return invlist;
7962 }
7963 #endif /* ifndef PERL_IN_XSUB_RE */
7964
7965 STATIC void
7966 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7967 {
7968     /* Grow the maximum size of an inversion list */
7969
7970     PERL_ARGS_ASSERT_INVLIST_EXTEND;
7971
7972     assert(SvTYPE(invlist) == SVt_INVLIST);
7973
7974     /* Add one to account for the zero element at the beginning which may not
7975      * be counted by the calling parameters */
7976     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
7977 }
7978
7979 PERL_STATIC_INLINE void
7980 S_invlist_trim(pTHX_ SV* const invlist)
7981 {
7982     PERL_ARGS_ASSERT_INVLIST_TRIM;
7983
7984     assert(SvTYPE(invlist) == SVt_INVLIST);
7985
7986     /* Change the length of the inversion list to how many entries it currently
7987      * has */
7988     SvPV_shrink_to_cur((SV *) invlist);
7989 }
7990
7991 STATIC void
7992 S__append_range_to_invlist(pTHX_ SV* const invlist,
7993                                  const UV start, const UV end)
7994 {
7995    /* Subject to change or removal.  Append the range from 'start' to 'end' at
7996     * the end of the inversion list.  The range must be above any existing
7997     * ones. */
7998
7999     UV* array;
8000     UV max = invlist_max(invlist);
8001     UV len = _invlist_len(invlist);
8002     bool offset;
8003
8004     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8005
8006     if (len == 0) { /* Empty lists must be initialized */
8007         offset = start != 0;
8008         array = _invlist_array_init(invlist, ! offset);
8009     }
8010     else {
8011         /* Here, the existing list is non-empty. The current max entry in the
8012          * list is generally the first value not in the set, except when the
8013          * set extends to the end of permissible values, in which case it is
8014          * the first entry in that final set, and so this call is an attempt to
8015          * append out-of-order */
8016
8017         UV final_element = len - 1;
8018         array = invlist_array(invlist);
8019         if (array[final_element] > start
8020             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8021         {
8022             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8023                      array[final_element], start,
8024                      ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8025         }
8026
8027         /* Here, it is a legal append.  If the new range begins with the first
8028          * value not in the set, it is extending the set, so the new first
8029          * value not in the set is one greater than the newly extended range.
8030          * */
8031         offset = *get_invlist_offset_addr(invlist);
8032         if (array[final_element] == start) {
8033             if (end != UV_MAX) {
8034                 array[final_element] = end + 1;
8035             }
8036             else {
8037                 /* But if the end is the maximum representable on the machine,
8038                  * just let the range that this would extend to have no end */
8039                 invlist_set_len(invlist, len - 1, offset);
8040             }
8041             return;
8042         }
8043     }
8044
8045     /* Here the new range doesn't extend any existing set.  Add it */
8046
8047     len += 2;   /* Includes an element each for the start and end of range */
8048
8049     /* If wll overflow the existing space, extend, which may cause the array to
8050      * be moved */
8051     if (max < len) {
8052         invlist_extend(invlist, len);
8053
8054         /* Have to set len here to avoid assert failure in invlist_array() */
8055         invlist_set_len(invlist, len, offset);
8056
8057         array = invlist_array(invlist);
8058     }
8059     else {
8060         invlist_set_len(invlist, len, offset);
8061     }
8062
8063     /* The next item on the list starts the range, the one after that is
8064      * one past the new range.  */
8065     array[len - 2] = start;
8066     if (end != UV_MAX) {
8067         array[len - 1] = end + 1;
8068     }
8069     else {
8070         /* But if the end is the maximum representable on the machine, just let
8071          * the range have no end */
8072         invlist_set_len(invlist, len - 1, offset);
8073     }
8074 }
8075
8076 #ifndef PERL_IN_XSUB_RE
8077
8078 IV
8079 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
8080 {
8081     /* Searches the inversion list for the entry that contains the input code
8082      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
8083      * return value is the index into the list's array of the range that
8084      * contains <cp> */
8085
8086     IV low = 0;
8087     IV mid;
8088     IV high = _invlist_len(invlist);
8089     const IV highest_element = high - 1;
8090     const UV* array;
8091
8092     PERL_ARGS_ASSERT__INVLIST_SEARCH;
8093
8094     /* If list is empty, return failure. */
8095     if (high == 0) {
8096         return -1;
8097     }
8098
8099     /* (We can't get the array unless we know the list is non-empty) */
8100     array = invlist_array(invlist);
8101
8102     mid = invlist_previous_index(invlist);
8103     assert(mid >=0 && mid <= highest_element);
8104
8105     /* <mid> contains the cache of the result of the previous call to this
8106      * function (0 the first time).  See if this call is for the same result,
8107      * or if it is for mid-1.  This is under the theory that calls to this
8108      * function will often be for related code points that are near each other.
8109      * And benchmarks show that caching gives better results.  We also test
8110      * here if the code point is within the bounds of the list.  These tests
8111      * replace others that would have had to be made anyway to make sure that
8112      * the array bounds were not exceeded, and these give us extra information
8113      * at the same time */
8114     if (cp >= array[mid]) {
8115         if (cp >= array[highest_element]) {
8116             return highest_element;
8117         }
8118
8119         /* Here, array[mid] <= cp < array[highest_element].  This means that
8120          * the final element is not the answer, so can exclude it; it also
8121          * means that <mid> is not the final element, so can refer to 'mid + 1'
8122          * safely */
8123         if (cp < array[mid + 1]) {
8124             return mid;
8125         }
8126         high--;
8127         low = mid + 1;
8128     }
8129     else { /* cp < aray[mid] */
8130         if (cp < array[0]) { /* Fail if outside the array */
8131             return -1;
8132         }
8133         high = mid;
8134         if (cp >= array[mid - 1]) {
8135             goto found_entry;
8136         }
8137     }
8138
8139     /* Binary search.  What we are looking for is <i> such that
8140      *  array[i] <= cp < array[i+1]
8141      * The loop below converges on the i+1.  Note that there may not be an
8142      * (i+1)th element in the array, and things work nonetheless */
8143     while (low < high) {
8144         mid = (low + high) / 2;
8145         assert(mid <= highest_element);
8146         if (array[mid] <= cp) { /* cp >= array[mid] */
8147             low = mid + 1;
8148
8149             /* We could do this extra test to exit the loop early.
8150             if (cp < array[low]) {
8151                 return mid;
8152             }
8153             */
8154         }
8155         else { /* cp < array[mid] */
8156             high = mid;
8157         }
8158     }
8159
8160   found_entry:
8161     high--;
8162     invlist_set_previous_index(invlist, high);
8163     return high;
8164 }
8165
8166 void
8167 Perl__invlist_populate_swatch(pTHX_ SV* const invlist,
8168                                     const UV start, const UV end, U8* swatch)
8169 {
8170     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8171      * but is used when the swash has an inversion list.  This makes this much
8172      * faster, as it uses a binary search instead of a linear one.  This is
8173      * intimately tied to that function, and perhaps should be in utf8.c,
8174      * except it is intimately tied to inversion lists as well.  It assumes
8175      * that <swatch> is all 0's on input */
8176
8177     UV current = start;
8178     const IV len = _invlist_len(invlist);
8179     IV i;
8180     const UV * array;
8181
8182     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8183
8184     if (len == 0) { /* Empty inversion list */
8185         return;
8186     }
8187
8188     array = invlist_array(invlist);
8189
8190     /* Find which element it is */
8191     i = _invlist_search(invlist, start);
8192
8193     /* We populate from <start> to <end> */
8194     while (current < end) {
8195         UV upper;
8196
8197         /* The inversion list gives the results for every possible code point
8198          * after the first one in the list.  Only those ranges whose index is
8199          * even are ones that the inversion list matches.  For the odd ones,
8200          * and if the initial code point is not in the list, we have to skip
8201          * forward to the next element */
8202         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8203             i++;
8204             if (i >= len) { /* Finished if beyond the end of the array */
8205                 return;
8206             }
8207             current = array[i];
8208             if (current >= end) {   /* Finished if beyond the end of what we
8209                                        are populating */
8210                 if (LIKELY(end < UV_MAX)) {
8211                     return;
8212                 }
8213
8214                 /* We get here when the upper bound is the maximum
8215                  * representable on the machine, and we are looking for just
8216                  * that code point.  Have to special case it */
8217                 i = len;
8218                 goto join_end_of_list;
8219             }
8220         }
8221         assert(current >= start);
8222
8223         /* The current range ends one below the next one, except don't go past
8224          * <end> */
8225         i++;
8226         upper = (i < len && array[i] < end) ? array[i] : end;
8227
8228         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
8229          * for each code point in it */
8230         for (; current < upper; current++) {
8231             const STRLEN offset = (STRLEN)(current - start);
8232             swatch[offset >> 3] |= 1 << (offset & 7);
8233         }
8234
8235     join_end_of_list:
8236
8237         /* Quit if at the end of the list */
8238         if (i >= len) {
8239
8240             /* But first, have to deal with the highest possible code point on
8241              * the platform.  The previous code assumes that <end> is one
8242              * beyond where we want to populate, but that is impossible at the
8243              * platform's infinity, so have to handle it specially */
8244             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8245             {
8246                 const STRLEN offset = (STRLEN)(end - start);
8247                 swatch[offset >> 3] |= 1 << (offset & 7);
8248             }
8249             return;
8250         }
8251
8252         /* Advance to the next range, which will be for code points not in the
8253          * inversion list */
8254         current = array[i];
8255     }
8256
8257     return;
8258 }
8259
8260 void
8261 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8262                                          const bool complement_b, SV** output)
8263 {
8264     /* Take the union of two inversion lists and point <output> to it.  *output
8265      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8266      * the reference count to that list will be decremented if not already a
8267      * temporary (mortal); otherwise *output will be made correspondingly
8268      * mortal.  The first list, <a>, may be NULL, in which case a copy of the
8269      * second list is returned.  If <complement_b> is TRUE, the union is taken
8270      * of the complement (inversion) of <b> instead of b itself.
8271      *
8272      * The basis for this comes from "Unicode Demystified" Chapter 13 by
8273      * Richard Gillam, published by Addison-Wesley, and explained at some
8274      * length there.  The preface says to incorporate its examples into your
8275      * code at your own risk.
8276      *
8277      * The algorithm is like a merge sort.
8278      *
8279      * XXX A potential performance improvement is to keep track as we go along
8280      * if only one of the inputs contributes to the result, meaning the other
8281      * is a subset of that one.  In that case, we can skip the final copy and
8282      * return the larger of the input lists, but then outside code might need
8283      * to keep track of whether to free the input list or not */
8284
8285     const UV* array_a;    /* a's array */
8286     const UV* array_b;
8287     UV len_a;       /* length of a's array */
8288     UV len_b;
8289
8290     SV* u;                      /* the resulting union */
8291     UV* array_u;
8292     UV len_u;
8293
8294     UV i_a = 0;             /* current index into a's array */
8295     UV i_b = 0;
8296     UV i_u = 0;
8297
8298     /* running count, as explained in the algorithm source book; items are
8299      * stopped accumulating and are output when the count changes to/from 0.
8300      * The count is incremented when we start a range that's in the set, and
8301      * decremented when we start a range that's not in the set.  So its range
8302      * is 0 to 2.  Only when the count is zero is something not in the set.
8303      */
8304     UV count = 0;
8305
8306     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8307     assert(a != b);
8308
8309     /* If either one is empty, the union is the other one */
8310     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8311         bool make_temp = FALSE; /* Should we mortalize the result? */
8312
8313         if (*output == a) {
8314             if (a != NULL) {
8315                 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8316                     SvREFCNT_dec_NN(a);
8317                 }
8318             }
8319         }
8320         if (*output != b) {
8321             *output = invlist_clone(b);
8322             if (complement_b) {
8323                 _invlist_invert(*output);
8324             }
8325         } /* else *output already = b; */
8326
8327         if (make_temp) {
8328             sv_2mortal(*output);
8329         }
8330         return;
8331     }
8332     else if ((len_b = _invlist_len(b)) == 0) {
8333         bool make_temp = FALSE;
8334         if (*output == b) {
8335             if (! (make_temp = cBOOL(SvTEMP(b)))) {
8336                 SvREFCNT_dec_NN(b);
8337             }
8338         }
8339
8340         /* The complement of an empty list is a list that has everything in it,
8341          * so the union with <a> includes everything too */
8342         if (complement_b) {
8343             if (a == *output) {
8344                 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8345                     SvREFCNT_dec_NN(a);
8346                 }
8347             }
8348             *output = _new_invlist(1);
8349             _append_range_to_invlist(*output, 0, UV_MAX);
8350         }
8351         else if (*output != a) {
8352             *output = invlist_clone(a);
8353         }
8354         /* else *output already = a; */
8355
8356         if (make_temp) {
8357             sv_2mortal(*output);
8358         }
8359         return;
8360     }
8361
8362     /* Here both lists exist and are non-empty */
8363     array_a = invlist_array(a);
8364     array_b = invlist_array(b);
8365
8366     /* If are to take the union of 'a' with the complement of b, set it
8367      * up so are looking at b's complement. */
8368     if (complement_b) {
8369
8370         /* To complement, we invert: if the first element is 0, remove it.  To
8371          * do this, we just pretend the array starts one later */
8372         if (array_b[0] == 0) {
8373             array_b++;
8374             len_b--;
8375         }
8376         else {
8377
8378             /* But if the first element is not zero, we pretend the list starts
8379              * at the 0 that is always stored immediately before the array. */
8380             array_b--;
8381             len_b++;
8382         }
8383     }
8384
8385     /* Size the union for the worst case: that the sets are completely
8386      * disjoint */
8387     u = _new_invlist(len_a + len_b);
8388
8389     /* Will contain U+0000 if either component does */
8390     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8391                                       || (len_b > 0 && array_b[0] == 0));
8392
8393     /* Go through each list item by item, stopping when exhausted one of
8394      * them */
8395     while (i_a < len_a && i_b < len_b) {
8396         UV cp;      /* The element to potentially add to the union's array */
8397         bool cp_in_set;   /* is it in the the input list's set or not */
8398
8399         /* We need to take one or the other of the two inputs for the union.
8400          * Since we are merging two sorted lists, we take the smaller of the
8401          * next items.  In case of a tie, we take the one that is in its set
8402          * first.  If we took one not in the set first, it would decrement the
8403          * count, possibly to 0 which would cause it to be output as ending the
8404          * range, and the next time through we would take the same number, and
8405          * output it again as beginning the next range.  By doing it the
8406          * opposite way, there is no possibility that the count will be
8407          * momentarily decremented to 0, and thus the two adjoining ranges will
8408          * be seamlessly merged.  (In a tie and both are in the set or both not
8409          * in the set, it doesn't matter which we take first.) */
8410         if (array_a[i_a] < array_b[i_b]
8411             || (array_a[i_a] == array_b[i_b]
8412                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8413         {
8414             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8415             cp= array_a[i_a++];
8416         }
8417         else {
8418             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8419             cp = array_b[i_b++];
8420         }
8421
8422         /* Here, have chosen which of the two inputs to look at.  Only output
8423          * if the running count changes to/from 0, which marks the
8424          * beginning/end of a range in that's in the set */
8425         if (cp_in_set) {
8426             if (count == 0) {
8427                 array_u[i_u++] = cp;
8428             }
8429             count++;
8430         }
8431         else {
8432             count--;
8433             if (count == 0) {
8434                 array_u[i_u++] = cp;
8435             }
8436         }
8437     }
8438
8439     /* Here, we are finished going through at least one of the lists, which
8440      * means there is something remaining in at most one.  We check if the list
8441      * that hasn't been exhausted is positioned such that we are in the middle
8442      * of a range in its set or not.  (i_a and i_b point to the element beyond
8443      * the one we care about.) If in the set, we decrement 'count'; if 0, there
8444      * is potentially more to output.
8445      * There are four cases:
8446      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
8447      *     in the union is entirely from the non-exhausted set.
8448      *  2) Both were in their sets, count is 2.  Nothing further should
8449      *     be output, as everything that remains will be in the exhausted
8450      *     list's set, hence in the union; decrementing to 1 but not 0 insures
8451      *     that
8452      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
8453      *     Nothing further should be output because the union includes
8454      *     everything from the exhausted set.  Not decrementing ensures that.
8455      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8456      *     decrementing to 0 insures that we look at the remainder of the
8457      *     non-exhausted set */
8458     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8459         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8460     {
8461         count--;
8462     }
8463
8464     /* The final length is what we've output so far, plus what else is about to
8465      * be output.  (If 'count' is non-zero, then the input list we exhausted
8466      * has everything remaining up to the machine's limit in its set, and hence
8467      * in the union, so there will be no further output. */
8468     len_u = i_u;
8469     if (count == 0) {
8470         /* At most one of the subexpressions will be non-zero */
8471         len_u += (len_a - i_a) + (len_b - i_b);
8472     }
8473
8474     /* Set result to final length, which can change the pointer to array_u, so
8475      * re-find it */
8476     if (len_u != _invlist_len(u)) {
8477         invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8478         invlist_trim(u);
8479         array_u = invlist_array(u);
8480     }
8481
8482     /* When 'count' is 0, the list that was exhausted (if one was shorter than
8483      * the other) ended with everything above it not in its set.  That means
8484      * that the remaining part of the union is precisely the same as the
8485      * non-exhausted list, so can just copy it unchanged.  (If both list were
8486      * exhausted at the same time, then the operations below will be both 0.)
8487      */
8488     if (count == 0) {
8489         IV copy_count; /* At most one will have a non-zero copy count */
8490         if ((copy_count = len_a - i_a) > 0) {
8491             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8492         }
8493         else if ((copy_count = len_b - i_b) > 0) {
8494             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8495         }
8496     }
8497
8498     /*  We may be removing a reference to one of the inputs.  If so, the output
8499      *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8500      *  count decremented) */
8501     if (a == *output || b == *output) {
8502         assert(! invlist_is_iterating(*output));
8503         if ((SvTEMP(*output))) {
8504             sv_2mortal(u);
8505         }
8506         else {
8507             SvREFCNT_dec_NN(*output);
8508         }
8509     }
8510
8511     *output = u;
8512
8513     return;
8514 }
8515
8516 void
8517 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8518                                                const bool complement_b, SV** i)
8519 {
8520     /* Take the intersection of two inversion lists and point <i> to it.  *i
8521      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8522      * the reference count to that list will be decremented if not already a
8523      * temporary (mortal); otherwise *i will be made correspondingly mortal.
8524      * The first list, <a>, may be NULL, in which case an empty list is
8525      * returned.  If <complement_b> is TRUE, the result will be the
8526      * intersection of <a> and the complement (or inversion) of <b> instead of
8527      * <b> directly.
8528      *
8529      * The basis for this comes from "Unicode Demystified" Chapter 13 by
8530      * Richard Gillam, published by Addison-Wesley, and explained at some
8531      * length there.  The preface says to incorporate its examples into your
8532      * code at your own risk.  In fact, it had bugs
8533      *
8534      * The algorithm is like a merge sort, and is essentially the same as the
8535      * union above
8536      */
8537
8538     const UV* array_a;          /* a's array */
8539     const UV* array_b;
8540     UV len_a;   /* length of a's array */
8541     UV len_b;
8542
8543     SV* r;                   /* the resulting intersection */
8544     UV* array_r;
8545     UV len_r;
8546
8547     UV i_a = 0;             /* current index into a's array */
8548     UV i_b = 0;
8549     UV i_r = 0;
8550
8551     /* running count, as explained in the algorithm source book; items are
8552      * stopped accumulating and are output when the count changes to/from 2.
8553      * The count is incremented when we start a range that's in the set, and
8554      * decremented when we start a range that's not in the set.  So its range
8555      * is 0 to 2.  Only when the count is 2 is something in the intersection.
8556      */
8557     UV count = 0;
8558
8559     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8560     assert(a != b);
8561
8562     /* Special case if either one is empty */
8563     len_a = (a == NULL) ? 0 : _invlist_len(a);
8564     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8565         bool make_temp = FALSE;
8566
8567         if (len_a != 0 && complement_b) {
8568
8569             /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8570              * be empty.  Here, also we are using 'b's complement, which hence
8571              * must be every possible code point.  Thus the intersection is
8572              * simply 'a'. */
8573             if (*i != a) {
8574                 if (*i == b) {
8575                     if (! (make_temp = cBOOL(SvTEMP(b)))) {
8576                         SvREFCNT_dec_NN(b);
8577                     }
8578                 }
8579
8580                 *i = invlist_clone(a);
8581             }
8582             /* else *i is already 'a' */
8583
8584             if (make_temp) {
8585                 sv_2mortal(*i);
8586             }
8587             return;
8588         }
8589
8590         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
8591          * intersection must be empty */
8592         if (*i == a) {
8593             if (! (make_temp = cBOOL(SvTEMP(a)))) {
8594                 SvREFCNT_dec_NN(a);
8595             }
8596         }
8597         else if (*i == b) {
8598             if (! (make_temp = cBOOL(SvTEMP(b)))) {
8599                 SvREFCNT_dec_NN(b);
8600             }
8601         }
8602         *i = _new_invlist(0);
8603         if (make_temp) {
8604             sv_2mortal(*i);
8605         }
8606
8607         return;
8608     }
8609
8610     /* Here both lists exist and are non-empty */
8611     array_a = invlist_array(a);
8612     array_b = invlist_array(b);
8613
8614     /* If are to take the intersection of 'a' with the complement of b, set it
8615      * up so are looking at b's complement. */
8616     if (complement_b) {
8617
8618         /* To complement, we invert: if the first element is 0, remove it.  To
8619          * do this, we just pretend the array starts one later */
8620         if (array_b[0] == 0) {
8621             array_b++;
8622             len_b--;
8623         }
8624         else {
8625
8626             /* But if the first element is not zero, we pretend the list starts
8627              * at the 0 that is always stored immediately before the array. */
8628             array_b--;
8629             len_b++;
8630         }
8631     }
8632
8633     /* Size the intersection for the worst case: that the intersection ends up
8634      * fragmenting everything to be completely disjoint */
8635     r= _new_invlist(len_a + len_b);
8636
8637     /* Will contain U+0000 iff both components do */
8638     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8639                                      && len_b > 0 && array_b[0] == 0);
8640
8641     /* Go through each list item by item, stopping when exhausted one of
8642      * them */
8643     while (i_a < len_a && i_b < len_b) {
8644         UV cp;      /* The element to potentially add to the intersection's
8645                        array */
8646         bool cp_in_set; /* Is it in the input list's set or not */
8647
8648         /* We need to take one or the other of the two inputs for the
8649          * intersection.  Since we are merging two sorted lists, we take the
8650          * smaller of the next items.  In case of a tie, we take the one that
8651          * is not in its set first (a difference from the union algorithm).  If
8652          * we took one in the set first, it would increment the count, possibly
8653          * to 2 which would cause it to be output as starting a range in the
8654          * intersection, and the next time through we would take that same
8655          * number, and output it again as ending the set.  By doing it the
8656          * opposite of this, there is no possibility that the count will be
8657          * momentarily incremented to 2.  (In a tie and both are in the set or
8658          * both not in the set, it doesn't matter which we take first.) */
8659         if (array_a[i_a] < array_b[i_b]
8660             || (array_a[i_a] == array_b[i_b]
8661                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8662         {
8663             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8664             cp= array_a[i_a++];
8665         }
8666         else {
8667             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8668             cp= array_b[i_b++];
8669         }
8670
8671         /* Here, have chosen which of the two inputs to look at.  Only output
8672          * if the running count changes to/from 2, which marks the
8673          * beginning/end of a range that's in the intersection */
8674         if (cp_in_set) {
8675             count++;
8676             if (count == 2) {
8677                 array_r[i_r++] = cp;
8678             }
8679         }
8680         else {
8681             if (count == 2) {
8682                 array_r[i_r++] = cp;
8683             }
8684             count--;
8685         }
8686     }
8687
8688     /* Here, we are finished going through at least one of the lists, which
8689      * means there is something remaining in at most one.  We check if the list
8690      * that has been exhausted is positioned such that we are in the middle
8691      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
8692      * the ones we care about.)  There are four cases:
8693      *  1) Both weren't in their sets, count is 0, and remains 0.  There's
8694      *     nothing left in the intersection.
8695      *  2) Both were in their sets, count is 2 and perhaps is incremented to
8696      *     above 2.  What should be output is exactly that which is in the
8697      *     non-exhausted set, as everything it has is also in the intersection
8698      *     set, and everything it doesn't have can't be in the intersection
8699      *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8700      *     gets incremented to 2.  Like the previous case, the intersection is
8701      *     everything that remains in the non-exhausted set.
8702      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8703      *     remains 1.  And the intersection has nothing more. */
8704     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8705         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8706     {
8707         count++;
8708     }
8709
8710     /* The final length is what we've output so far plus what else is in the
8711      * intersection.  At most one of the subexpressions below will be non-zero
8712      * */
8713     len_r = i_r;
8714     if (count >= 2) {
8715         len_r += (len_a - i_a) + (len_b - i_b);
8716     }
8717
8718     /* Set result to final length, which can change the pointer to array_r, so
8719      * re-find it */
8720     if (len_r != _invlist_len(r)) {
8721         invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8722         invlist_trim(r);
8723         array_r = invlist_array(r);
8724     }
8725
8726     /* Finish outputting any remaining */
8727     if (count >= 2) { /* At most one will have a non-zero copy count */
8728         IV copy_count;
8729         if ((copy_count = len_a - i_a) > 0) {
8730             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8731         }
8732         else if ((copy_count = len_b - i_b) > 0) {
8733             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8734         }
8735     }
8736
8737     /*  We may be removing a reference to one of the inputs.  If so, the output
8738      *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8739      *  count decremented) */
8740     if (a == *i || b == *i) {
8741         assert(! invlist_is_iterating(*i));
8742         if (SvTEMP(*i)) {
8743             sv_2mortal(r);
8744         }
8745         else {
8746             SvREFCNT_dec_NN(*i);
8747         }
8748     }
8749
8750     *i = r;
8751
8752     return;
8753 }
8754
8755 SV*
8756 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8757 {
8758     /* Add the range from 'start' to 'end' inclusive to the inversion list's
8759      * set.  A pointer to the inversion list is returned.  This may actually be
8760      * a new list, in which case the passed in one has been destroyed.  The
8761      * passed in inversion list can be NULL, in which case a new one is created
8762      * with just the one range in it */
8763
8764     SV* range_invlist;
8765     UV len;
8766
8767     if (invlist == NULL) {
8768         invlist = _new_invlist(2);
8769         len = 0;
8770     }
8771     else {
8772         len = _invlist_len(invlist);
8773     }
8774
8775     /* If comes after the final entry actually in the list, can just append it
8776      * to the end, */
8777     if (len == 0
8778         || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8779             && start >= invlist_array(invlist)[len - 1]))
8780     {
8781         _append_range_to_invlist(invlist, start, end);
8782         return invlist;
8783     }
8784
8785     /* Here, can't just append things, create and return a new inversion list
8786      * which is the union of this range and the existing inversion list */
8787     range_invlist = _new_invlist(2);
8788     _append_range_to_invlist(range_invlist, start, end);
8789
8790     _invlist_union(invlist, range_invlist, &invlist);
8791
8792     /* The temporary can be freed */
8793     SvREFCNT_dec_NN(range_invlist);
8794
8795     return invlist;
8796 }
8797
8798 SV*
8799 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
8800                                  UV** other_elements_ptr)
8801 {
8802     /* Create and return an inversion list whose contents are to be populated
8803      * by the caller.  The caller gives the number of elements (in 'size') and
8804      * the very first element ('element0').  This function will set
8805      * '*other_elements_ptr' to an array of UVs, where the remaining elements
8806      * are to be placed.
8807      *
8808      * Obviously there is some trust involved that the caller will properly
8809      * fill in the other elements of the array.
8810      *
8811      * (The first element needs to be passed in, as the underlying code does
8812      * things differently depending on whether it is zero or non-zero) */
8813
8814     SV* invlist = _new_invlist(size);
8815     bool offset;
8816
8817     PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
8818
8819     _append_range_to_invlist(invlist, element0, element0);
8820     offset = *get_invlist_offset_addr(invlist);
8821
8822     invlist_set_len(invlist, size, offset);
8823     *other_elements_ptr = invlist_array(invlist) + 1;
8824     return invlist;
8825 }
8826
8827 #endif
8828
8829 PERL_STATIC_INLINE SV*
8830 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8831     return _add_range_to_invlist(invlist, cp, cp);
8832 }
8833
8834 #ifndef PERL_IN_XSUB_RE
8835 void
8836 Perl__invlist_invert(pTHX_ SV* const invlist)
8837 {
8838     /* Complement the input inversion list.  This adds a 0 if the list didn't
8839      * have a zero; removes it otherwise.  As described above, the data
8840      * structure is set up so that this is very efficient */
8841
8842     PERL_ARGS_ASSERT__INVLIST_INVERT;
8843
8844     assert(! invlist_is_iterating(invlist));
8845
8846     /* The inverse of matching nothing is matching everything */
8847     if (_invlist_len(invlist) == 0) {
8848         _append_range_to_invlist(invlist, 0, UV_MAX);
8849         return;
8850     }
8851
8852     *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
8853 }
8854
8855 #endif
8856
8857 PERL_STATIC_INLINE SV*
8858 S_invlist_clone(pTHX_ SV* const invlist)
8859 {
8860
8861     /* Return a new inversion list that is a copy of the input one, which is
8862      * unchanged.  The new list will not be mortal even if the old one was. */
8863
8864     /* Need to allocate extra space to accommodate Perl's addition of a
8865      * trailing NUL to SvPV's, since it thinks they are always strings */
8866     SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8867     STRLEN physical_length = SvCUR(invlist);
8868     bool offset = *(get_invlist_offset_addr(invlist));
8869
8870     PERL_ARGS_ASSERT_INVLIST_CLONE;
8871
8872     *(get_invlist_offset_addr(new_invlist)) = offset;
8873     invlist_set_len(new_invlist, _invlist_len(invlist), offset);
8874     Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
8875
8876     return new_invlist;
8877 }
8878
8879 PERL_STATIC_INLINE STRLEN*
8880 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8881 {
8882     /* Return the address of the UV that contains the current iteration
8883      * position */
8884
8885     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8886
8887     assert(SvTYPE(invlist) == SVt_INVLIST);
8888
8889     return &(((XINVLIST*) SvANY(invlist))->iterator);
8890 }
8891
8892 PERL_STATIC_INLINE void
8893 S_invlist_iterinit(pTHX_ SV* invlist)   /* Initialize iterator for invlist */
8894 {
8895     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8896
8897     *get_invlist_iter_addr(invlist) = 0;
8898 }
8899
8900 PERL_STATIC_INLINE void
8901 S_invlist_iterfinish(pTHX_ SV* invlist)
8902 {
8903     /* Terminate iterator for invlist.  This is to catch development errors.
8904      * Any iteration that is interrupted before completed should call this
8905      * function.  Functions that add code points anywhere else but to the end
8906      * of an inversion list assert that they are not in the middle of an
8907      * iteration.  If they were, the addition would make the iteration
8908      * problematical: if the iteration hadn't reached the place where things
8909      * were being added, it would be ok */
8910
8911     PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8912
8913     *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
8914 }
8915
8916 STATIC bool
8917 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8918 {
8919     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8920      * This call sets in <*start> and <*end>, the next range in <invlist>.
8921      * Returns <TRUE> if successful and the next call will return the next
8922      * range; <FALSE> if was already at the end of the list.  If the latter,
8923      * <*start> and <*end> are unchanged, and the next call to this function
8924      * will start over at the beginning of the list */
8925
8926     STRLEN* pos = get_invlist_iter_addr(invlist);
8927     UV len = _invlist_len(invlist);
8928     UV *array;
8929
8930     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8931
8932     if (*pos >= len) {
8933         *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
8934         return FALSE;
8935     }
8936
8937     array = invlist_array(invlist);
8938
8939     *start = array[(*pos)++];
8940
8941     if (*pos >= len) {
8942         *end = UV_MAX;
8943     }
8944     else {
8945         *end = array[(*pos)++] - 1;
8946     }
8947
8948     return TRUE;
8949 }
8950
8951 PERL_STATIC_INLINE bool
8952 S_invlist_is_iterating(pTHX_ SV* const invlist)
8953 {
8954     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8955
8956     return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8957 }
8958
8959 PERL_STATIC_INLINE UV
8960 S_invlist_highest(pTHX_ SV* const invlist)
8961 {
8962     /* Returns the highest code point that matches an inversion list.  This API
8963      * has an ambiguity, as it returns 0 under either the highest is actually
8964      * 0, or if the list is empty.  If this distinction matters to you, check
8965      * for emptiness before calling this function */
8966
8967     UV len = _invlist_len(invlist);
8968     UV *array;
8969
8970     PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8971
8972     if (len == 0) {
8973         return 0;
8974     }
8975
8976     array = invlist_array(invlist);
8977
8978     /* The last element in the array in the inversion list always starts a
8979      * range that goes to infinity.  That range may be for code points that are
8980      * matched in the inversion list, or it may be for ones that aren't
8981      * matched.  In the latter case, the highest code point in the set is one
8982      * less than the beginning of this range; otherwise it is the final element
8983      * of this range: infinity */
8984     return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8985            ? UV_MAX
8986            : array[len - 1] - 1;
8987 }
8988
8989 #ifndef PERL_IN_XSUB_RE
8990 SV *
8991 Perl__invlist_contents(pTHX_ SV* const invlist)
8992 {
8993     /* Get the contents of an inversion list into a string SV so that they can
8994      * be printed out.  It uses the format traditionally done for debug tracing
8995      */
8996
8997     UV start, end;
8998     SV* output = newSVpvs("\n");
8999
9000     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
9001
9002     assert(! invlist_is_iterating(invlist));
9003
9004     invlist_iterinit(invlist);
9005     while (invlist_iternext(invlist, &start, &end)) {
9006         if (end == UV_MAX) {
9007             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
9008         }
9009         else if (end != start) {
9010             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
9011                     start,       end);
9012         }
9013         else {
9014             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
9015         }
9016     }
9017
9018     return output;
9019 }
9020 #endif
9021
9022 #ifndef PERL_IN_XSUB_RE
9023 void
9024 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
9025                          const char * const indent, SV* const invlist)
9026 {
9027     /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
9028      * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
9029      * the string 'indent'.  The output looks like this:
9030          [0] 0x000A .. 0x000D
9031          [2] 0x0085
9032          [4] 0x2028 .. 0x2029
9033          [6] 0x3104 .. INFINITY
9034      * This means that the first range of code points matched by the list are
9035      * 0xA through 0xD; the second range contains only the single code point
9036      * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
9037      * are used to define each range (except if the final range extends to
9038      * infinity, only a single element is needed).  The array index of the
9039      * first element for the corresponding range is given in brackets. */
9040
9041     UV start, end;
9042     STRLEN count = 0;
9043
9044     PERL_ARGS_ASSERT__INVLIST_DUMP;
9045
9046     if (invlist_is_iterating(invlist)) {
9047         Perl_dump_indent(aTHX_ level, file,
9048              "%sCan't dump inversion list because is in middle of iterating\n",
9049              indent);
9050         return;
9051     }
9052
9053     invlist_iterinit(invlist);
9054     while (invlist_iternext(invlist, &start, &end)) {
9055         if (end == UV_MAX) {
9056             Perl_dump_indent(aTHX_ level, file,
9057                                        "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9058                                    indent, (UV)count, start);
9059         }
9060         else if (end != start) {
9061             Perl_dump_indent(aTHX_ level, file,
9062                                     "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9063                                 indent, (UV)count, start,         end);
9064         }
9065         else {
9066             Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9067                                             indent, (UV)count, start);
9068         }
9069         count += 2;
9070     }
9071 }
9072
9073 void
9074 Perl__load_PL_utf8_foldclosures (pTHX)
9075 {
9076     assert(! PL_utf8_foldclosures);
9077
9078     /* If the folds haven't been read in, call a fold function
9079      * to force that */
9080     if (! PL_utf8_tofold) {
9081         U8 dummy[UTF8_MAXBYTES_CASE+1];
9082
9083         /* This string is just a short named one above \xff */
9084         to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
9085         assert(PL_utf8_tofold); /* Verify that worked */
9086     }
9087     PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
9088 }
9089 #endif
9090
9091 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
9092 bool
9093 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9094 {
9095     /* Return a boolean as to if the two passed in inversion lists are
9096      * identical.  The final argument, if TRUE, says to take the complement of
9097      * the second inversion list before doing the comparison */
9098
9099     const UV* array_a = invlist_array(a);
9100     const UV* array_b = invlist_array(b);
9101     UV len_a = _invlist_len(a);
9102     UV len_b = _invlist_len(b);
9103
9104     UV i = 0;               /* current index into the arrays */
9105     bool retval = TRUE;     /* Assume are identical until proven otherwise */
9106
9107     PERL_ARGS_ASSERT__INVLISTEQ;
9108
9109     /* If are to compare 'a' with the complement of b, set it
9110      * up so are looking at b's complement. */
9111     if (complement_b) {
9112
9113         /* The complement of nothing is everything, so <a> would have to have
9114          * just one element, starting at zero (ending at infinity) */
9115         if (len_b == 0) {
9116             return (len_a == 1 && array_a[0] == 0);
9117         }
9118         else if (array_b[0] == 0) {
9119
9120             /* Otherwise, to complement, we invert.  Here, the first element is
9121              * 0, just remove it.  To do this, we just pretend the array starts
9122              * one later */
9123
9124             array_b++;
9125             len_b--;
9126         }
9127         else {
9128
9129             /* But if the first element is not zero, we pretend the list starts
9130              * at the 0 that is always stored immediately before the array. */
9131             array_b--;
9132             len_b++;
9133         }
9134     }
9135
9136     /* Make sure that the lengths are the same, as well as the final element
9137      * before looping through the remainder.  (Thus we test the length, final,
9138      * and first elements right off the bat) */
9139     if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9140         retval = FALSE;
9141     }
9142     else for (i = 0; i < len_a - 1; i++) {
9143         if (array_a[i] != array_b[i]) {
9144             retval = FALSE;
9145             break;
9146         }
9147     }
9148
9149     return retval;
9150 }
9151 #endif
9152
9153 #undef HEADER_LENGTH
9154 #undef TO_INTERNAL_SIZE
9155 #undef FROM_INTERNAL_SIZE
9156 #undef INVLIST_VERSION_ID
9157
9158 /* End of inversion list object */
9159
9160 STATIC void
9161 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
9162 {
9163     /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
9164      * constructs, and updates RExC_flags with them.  On input, RExC_parse
9165      * should point to the first flag; it is updated on output to point to the
9166      * final ')' or ':'.  There needs to be at least one flag, or this will
9167      * abort */
9168
9169     /* for (?g), (?gc), and (?o) warnings; warning
9170        about (?c) will warn about (?g) -- japhy    */
9171
9172 #define WASTED_O  0x01
9173 #define WASTED_G  0x02
9174 #define WASTED_C  0x04
9175 #define WASTED_GC (WASTED_G|WASTED_C)
9176     I32 wastedflags = 0x00;
9177     U32 posflags = 0, negflags = 0;
9178     U32 *flagsp = &posflags;
9179     char has_charset_modifier = '\0';
9180     regex_charset cs;
9181     bool has_use_defaults = FALSE;
9182     const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
9183
9184     PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
9185
9186     /* '^' as an initial flag sets certain defaults */
9187     if (UCHARAT(RExC_parse) == '^') {
9188         RExC_parse++;
9189         has_use_defaults = TRUE;
9190         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9191         set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
9192                                         ? REGEX_UNICODE_CHARSET
9193                                         : REGEX_DEPENDS_CHARSET);
9194     }
9195
9196     cs = get_regex_charset(RExC_flags);
9197     if (cs == REGEX_DEPENDS_CHARSET
9198         && (RExC_utf8 || RExC_uni_semantics))
9199     {
9200         cs = REGEX_UNICODE_CHARSET;
9201     }
9202
9203     while (*RExC_parse) {
9204         /* && strchr("iogcmsx", *RExC_parse) */
9205         /* (?g), (?gc) and (?o) are useless here
9206            and must be globally applied -- japhy */
9207         switch (*RExC_parse) {
9208
9209             /* Code for the imsx flags */
9210             CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
9211
9212             case LOCALE_PAT_MOD:
9213                 if (has_charset_modifier) {
9214                     goto excess_modifier;
9215                 }
9216                 else if (flagsp == &negflags) {
9217                     goto neg_modifier;
9218                 }
9219                 cs = REGEX_LOCALE_CHARSET;
9220                 has_charset_modifier = LOCALE_PAT_MOD;
9221                 break;
9222             case UNICODE_PAT_MOD:
9223                 if (has_charset_modifier) {
9224                     goto excess_modifier;
9225                 }
9226                 else if (flagsp == &negflags) {
9227                     goto neg_modifier;
9228                 }
9229                 cs = REGEX_UNICODE_CHARSET;
9230                 has_charset_modifier = UNICODE_PAT_MOD;
9231                 break;
9232             case ASCII_RESTRICT_PAT_MOD:
9233                 if (flagsp == &negflags) {
9234                     goto neg_modifier;
9235                 }
9236                 if (has_charset_modifier) {
9237                     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9238                         goto excess_modifier;
9239                     }
9240                     /* Doubled modifier implies more restricted */
9241                     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9242                 }
9243                 else {
9244                     cs = REGEX_ASCII_RESTRICTED_CHARSET;
9245                 }
9246                 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9247                 break;
9248             case DEPENDS_PAT_MOD:
9249                 if (has_use_defaults) {
9250                     goto fail_modifiers;
9251                 }
9252                 else if (flagsp == &negflags) {
9253                     goto neg_modifier;
9254                 }
9255                 else if (has_charset_modifier) {
9256                     goto excess_modifier;
9257                 }
9258
9259                 /* The dual charset means unicode semantics if the
9260                  * pattern (or target, not known until runtime) are
9261                  * utf8, or something in the pattern indicates unicode
9262                  * semantics */
9263                 cs = (RExC_utf8 || RExC_uni_semantics)
9264                      ? REGEX_UNICODE_CHARSET
9265                      : REGEX_DEPENDS_CHARSET;
9266                 has_charset_modifier = DEPENDS_PAT_MOD;
9267                 break;
9268             excess_modifier:
9269                 RExC_parse++;
9270                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9271                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9272                 }
9273                 else if (has_charset_modifier == *(RExC_parse - 1)) {
9274                     vFAIL2("Regexp modifier \"%c\" may not appear twice",
9275                                         *(RExC_parse - 1));
9276                 }
9277                 else {
9278                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9279                 }
9280                 /*NOTREACHED*/
9281             neg_modifier:
9282                 RExC_parse++;
9283                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
9284                                     *(RExC_parse - 1));
9285                 /*NOTREACHED*/
9286             case ONCE_PAT_MOD: /* 'o' */
9287             case GLOBAL_PAT_MOD: /* 'g' */
9288                 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9289                     const I32 wflagbit = *RExC_parse == 'o'
9290                                          ? WASTED_O
9291                                          : WASTED_G;
9292                     if (! (wastedflags & wflagbit) ) {
9293                         wastedflags |= wflagbit;
9294                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9295                         vWARN5(
9296                             RExC_parse + 1,
9297                             "Useless (%s%c) - %suse /%c modifier",
9298                             flagsp == &negflags ? "?-" : "?",
9299                             *RExC_parse,
9300                             flagsp == &negflags ? "don't " : "",
9301                             *RExC_parse
9302                         );
9303                     }
9304                 }
9305                 break;
9306
9307             case CONTINUE_PAT_MOD: /* 'c' */
9308                 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9309                     if (! (wastedflags & WASTED_C) ) {
9310                         wastedflags |= WASTED_GC;
9311                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9312                         vWARN3(
9313                             RExC_parse + 1,
9314                             "Useless (%sc) - %suse /gc modifier",
9315                             flagsp == &negflags ? "?-" : "?",
9316                             flagsp == &negflags ? "don't " : ""
9317                         );
9318                     }
9319                 }
9320                 break;
9321             case KEEPCOPY_PAT_MOD: /* 'p' */
9322                 if (flagsp == &negflags) {
9323                     if (SIZE_ONLY)
9324                         ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9325                 } else {
9326                     *flagsp |= RXf_PMf_KEEPCOPY;
9327                 }
9328                 break;
9329             case '-':
9330                 /* A flag is a default iff it is following a minus, so
9331                  * if there is a minus, it means will be trying to
9332                  * re-specify a default which is an error */
9333                 if (has_use_defaults || flagsp == &negflags) {
9334                     goto fail_modifiers;
9335                 }
9336                 flagsp = &negflags;
9337                 wastedflags = 0;  /* reset so (?g-c) warns twice */
9338                 break;
9339             case ':':
9340             case ')':
9341                 RExC_flags |= posflags;
9342                 RExC_flags &= ~negflags;
9343                 set_regex_charset(&RExC_flags, cs);
9344                 if (RExC_flags & RXf_PMf_FOLD) {
9345                     RExC_contains_i = 1;
9346                 }
9347                 return;
9348                 /*NOTREACHED*/
9349             default:
9350             fail_modifiers:
9351                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9352                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9353                 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9354                       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9355                 /*NOTREACHED*/
9356         }
9357
9358         ++RExC_parse;
9359     }
9360 }
9361
9362 /*
9363  - reg - regular expression, i.e. main body or parenthesized thing
9364  *
9365  * Caller must absorb opening parenthesis.
9366  *
9367  * Combining parenthesis handling with the base level of regular expression
9368  * is a trifle forced, but the need to tie the tails of the branches to what
9369  * follows makes it hard to avoid.
9370  */
9371 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9372 #ifdef DEBUGGING
9373 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9374 #else
9375 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9376 #endif
9377
9378 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9379    flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
9380    needs to be restarted.
9381    Otherwise would only return NULL if regbranch() returns NULL, which
9382    cannot happen.  */
9383 STATIC regnode *
9384 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9385     /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9386      * 2 is like 1, but indicates that nextchar() has been called to advance
9387      * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
9388      * this flag alerts us to the need to check for that */
9389 {
9390     dVAR;
9391     regnode *ret;               /* Will be the head of the group. */
9392     regnode *br;
9393     regnode *lastbr;
9394     regnode *ender = NULL;
9395     I32 parno = 0;
9396     I32 flags;
9397     U32 oregflags = RExC_flags;
9398     bool have_branch = 0;
9399     bool is_open = 0;
9400     I32 freeze_paren = 0;
9401     I32 after_freeze = 0;
9402
9403     char * parse_start = RExC_parse; /* MJD */
9404     char * const oregcomp_parse = RExC_parse;
9405
9406     GET_RE_DEBUG_FLAGS_DECL;
9407
9408     PERL_ARGS_ASSERT_REG;
9409     DEBUG_PARSE("reg ");
9410
9411     *flagp = 0;                         /* Tentatively. */
9412
9413
9414     /* Make an OPEN node, if parenthesized. */
9415     if (paren) {
9416
9417         /* Under /x, space and comments can be gobbled up between the '(' and
9418          * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
9419          * intervening space, as the sequence is a token, and a token should be
9420          * indivisible */
9421         bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9422
9423         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9424             char *start_verb = RExC_parse;
9425             STRLEN verb_len = 0;
9426             char *start_arg = NULL;
9427             unsigned char op = 0;
9428             int argok = 1;
9429             int internal_argval = 0; /* internal_argval is only useful if
9430                                         !argok */
9431
9432             if (has_intervening_patws) {
9433                 RExC_parse++;
9434                 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
9435             }
9436             while ( *RExC_parse && *RExC_parse != ')' ) {
9437                 if ( *RExC_parse == ':' ) {
9438                     start_arg = RExC_parse + 1;
9439                     break;
9440                 }
9441                 RExC_parse++;
9442             }
9443             ++start_verb;
9444             verb_len = RExC_parse - start_verb;
9445             if ( start_arg ) {
9446                 RExC_parse++;
9447                 while ( *RExC_parse && *RExC_parse != ')' )
9448                     RExC_parse++;
9449                 if ( *RExC_parse != ')' )
9450                     vFAIL("Unterminated verb pattern argument");
9451                 if ( RExC_parse == start_arg )
9452                     start_arg = NULL;
9453             } else {
9454                 if ( *RExC_parse != ')' )
9455                     vFAIL("Unterminated verb pattern");
9456             }
9457
9458             switch ( *start_verb ) {
9459             case 'A':  /* (*ACCEPT) */
9460                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9461                     op = ACCEPT;
9462                     internal_argval = RExC_nestroot;
9463                 }
9464                 break;
9465             case 'C':  /* (*COMMIT) */
9466                 if ( memEQs(start_verb,verb_len,"COMMIT") )
9467                     op = COMMIT;
9468                 break;
9469             case 'F':  /* (*FAIL) */
9470                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9471                     op = OPFAIL;
9472                     argok = 0;
9473                 }
9474                 break;
9475             case ':':  /* (*:NAME) */
9476             case 'M':  /* (*MARK:NAME) */
9477                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9478                     op = MARKPOINT;
9479                     argok = -1;
9480                 }
9481                 break;
9482             case 'P':  /* (*PRUNE) */
9483                 if ( memEQs(start_verb,verb_len,"PRUNE") )
9484                     op = PRUNE;
9485                 break;
9486             case 'S':   /* (*SKIP) */
9487                 if ( memEQs(start_verb,verb_len,"SKIP") )
9488                     op = SKIP;
9489                 break;
9490             case 'T':  /* (*THEN) */
9491                 /* [19:06] <TimToady> :: is then */
9492                 if ( memEQs(start_verb,verb_len,"THEN") ) {
9493                     op = CUTGROUP;
9494                     RExC_seen |= REG_CUTGROUP_SEEN;
9495                 }
9496                 break;
9497             }
9498             if ( ! op ) {
9499                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9500                 vFAIL2utf8f(
9501                     "Unknown verb pattern '%"UTF8f"'",
9502                     UTF8fARG(UTF, verb_len, start_verb));
9503             }
9504             if ( argok ) {
9505                 if ( start_arg && internal_argval ) {
9506                     vFAIL3("Verb pattern '%.*s' may not have an argument",
9507                         verb_len, start_verb);
9508                 } else if ( argok < 0 && !start_arg ) {
9509                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9510                         verb_len, start_verb);
9511                 } else {
9512                     ret = reganode(pRExC_state, op, internal_argval);
9513                     if ( ! internal_argval && ! SIZE_ONLY ) {
9514                         if (start_arg) {
9515                             SV *sv = newSVpvn( start_arg,
9516                                                RExC_parse - start_arg);
9517                             ARG(ret) = add_data( pRExC_state,
9518                                                  STR_WITH_LEN("S"));
9519                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9520                             ret->flags = 0;
9521                         } else {
9522                             ret->flags = 1;
9523                         }
9524                     }
9525                 }
9526                 if (!internal_argval)
9527                     RExC_seen |= REG_VERBARG_SEEN;
9528             } else if ( start_arg ) {
9529                 vFAIL3("Verb pattern '%.*s' may not have an argument",
9530                         verb_len, start_verb);
9531             } else {
9532                 ret = reg_node(pRExC_state, op);
9533             }
9534             nextchar(pRExC_state);
9535             return ret;
9536         }
9537         else if (*RExC_parse == '?') { /* (?...) */
9538             bool is_logical = 0;
9539             const char * const seqstart = RExC_parse;
9540             if (has_intervening_patws) {
9541                 RExC_parse++;
9542                 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
9543             }
9544
9545             RExC_parse++;
9546             paren = *RExC_parse++;
9547             ret = NULL;                 /* For look-ahead/behind. */
9548             switch (paren) {
9549
9550             case 'P':   /* (?P...) variants for those used to PCRE/Python */
9551                 paren = *RExC_parse++;
9552                 if ( paren == '<')         /* (?P<...>) named capture */
9553                     goto named_capture;
9554                 else if (paren == '>') {   /* (?P>name) named recursion */
9555                     goto named_recursion;
9556                 }
9557                 else if (paren == '=') {   /* (?P=...)  named backref */
9558                     /* this pretty much dupes the code for \k<NAME> in
9559                      * regatom(), if you change this make sure you change that
9560                      * */
9561                     char* name_start = RExC_parse;
9562                     U32 num = 0;
9563                     SV *sv_dat = reg_scan_name(pRExC_state,
9564                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9565                     if (RExC_parse == name_start || *RExC_parse != ')')
9566                         /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9567                         vFAIL2("Sequence %.3s... not terminated",parse_start);
9568
9569                     if (!SIZE_ONLY) {
9570                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
9571                         RExC_rxi->data->data[num]=(void*)sv_dat;
9572                         SvREFCNT_inc_simple_void(sv_dat);
9573                     }
9574                     RExC_sawback = 1;
9575                     ret = reganode(pRExC_state,
9576                                    ((! FOLD)
9577                                      ? NREF
9578                                      : (ASCII_FOLD_RESTRICTED)
9579                                        ? NREFFA
9580                                        : (AT_LEAST_UNI_SEMANTICS)
9581                                          ? NREFFU
9582                                          : (LOC)
9583                                            ? NREFFL
9584                                            : NREFF),
9585                                     num);
9586                     *flagp |= HASWIDTH;
9587
9588                     Set_Node_Offset(ret, parse_start+1);
9589                     Set_Node_Cur_Length(ret, parse_start);
9590
9591                     nextchar(pRExC_state);
9592                     return ret;
9593                 }
9594                 RExC_parse++;
9595                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9596                 vFAIL3("Sequence (%.*s...) not recognized",
9597                                 RExC_parse-seqstart, seqstart);
9598                 /*NOTREACHED*/
9599             case '<':           /* (?<...) */
9600                 if (*RExC_parse == '!')
9601                     paren = ',';
9602                 else if (*RExC_parse != '=')
9603               named_capture:
9604                 {               /* (?<...>) */
9605                     char *name_start;
9606                     SV *svname;
9607                     paren= '>';
9608             case '\'':          /* (?'...') */
9609                     name_start= RExC_parse;
9610                     svname = reg_scan_name(pRExC_state,
9611                         SIZE_ONLY    /* reverse test from the others */
9612                         ? REG_RSN_RETURN_NAME
9613                         : REG_RSN_RETURN_NULL);
9614                     if (RExC_parse == name_start || *RExC_parse != paren)
9615                         vFAIL2("Sequence (?%c... not terminated",
9616                             paren=='>' ? '<' : paren);
9617                     if (SIZE_ONLY) {
9618                         HE *he_str;
9619                         SV *sv_dat = NULL;
9620                         if (!svname) /* shouldn't happen */
9621                             Perl_croak(aTHX_
9622                                 "panic: reg_scan_name returned NULL");
9623                         if (!RExC_paren_names) {
9624                             RExC_paren_names= newHV();
9625                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
9626 #ifdef DEBUGGING
9627                             RExC_paren_name_list= newAV();
9628                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
9629 #endif
9630                         }
9631                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
9632                         if ( he_str )
9633                             sv_dat = HeVAL(he_str);
9634                         if ( ! sv_dat ) {
9635                             /* croak baby croak */
9636                             Perl_croak(aTHX_
9637                                 "panic: paren_name hash element allocation failed");
9638                         } else if ( SvPOK(sv_dat) ) {
9639                             /* (?|...) can mean we have dupes so scan to check
9640                                its already been stored. Maybe a flag indicating
9641                                we are inside such a construct would be useful,
9642                                but the arrays are likely to be quite small, so
9643                                for now we punt -- dmq */
9644                             IV count = SvIV(sv_dat);
9645                             I32 *pv = (I32*)SvPVX(sv_dat);
9646                             IV i;
9647                             for ( i = 0 ; i < count ; i++ ) {
9648                                 if ( pv[i] == RExC_npar ) {
9649                                     count = 0;
9650                                     break;
9651                                 }
9652                             }
9653                             if ( count ) {
9654                                 pv = (I32*)SvGROW(sv_dat,
9655                                                 SvCUR(sv_dat) + sizeof(I32)+1);
9656                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
9657                                 pv[count] = RExC_npar;
9658                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
9659                             }
9660                         } else {
9661                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
9662                             sv_setpvn(sv_dat, (char *)&(RExC_npar),
9663                                                                 sizeof(I32));
9664                             SvIOK_on(sv_dat);
9665                             SvIV_set(sv_dat, 1);
9666                         }
9667 #ifdef DEBUGGING
9668                         /* Yes this does cause a memory leak in debugging Perls
9669                          * */
9670                         if (!av_store(RExC_paren_name_list,
9671                                       RExC_npar, SvREFCNT_inc(svname)))
9672                             SvREFCNT_dec_NN(svname);
9673 #endif
9674
9675                         /*sv_dump(sv_dat);*/
9676                     }
9677                     nextchar(pRExC_state);
9678                     paren = 1;
9679                     goto capturing_parens;
9680                 }
9681                 RExC_seen |= REG_LOOKBEHIND_SEEN;
9682                 RExC_in_lookbehind++;
9683                 RExC_parse++;
9684                 /* FALLTHROUGH */
9685             case '=':           /* (?=...) */
9686                 RExC_seen_zerolen++;
9687                 break;
9688             case '!':           /* (?!...) */
9689                 RExC_seen_zerolen++;
9690                 if (*RExC_parse == ')') {
9691                     ret=reg_node(pRExC_state, OPFAIL);
9692                     nextchar(pRExC_state);
9693                     return ret;
9694                 }
9695                 break;
9696             case '|':           /* (?|...) */
9697                 /* branch reset, behave like a (?:...) except that
9698                    buffers in alternations share the same numbers */
9699                 paren = ':';
9700                 after_freeze = freeze_paren = RExC_npar;
9701                 break;
9702             case ':':           /* (?:...) */
9703             case '>':           /* (?>...) */
9704                 break;
9705             case '$':           /* (?$...) */
9706             case '@':           /* (?@...) */
9707                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
9708                 break;
9709             case '0' :           /* (?0) */
9710             case 'R' :           /* (?R) */
9711                 if (*RExC_parse != ')')
9712                     FAIL("Sequence (?R) not terminated");
9713                 ret = reg_node(pRExC_state, GOSTART);
9714                     RExC_seen |= REG_GOSTART_SEEN;
9715                 *flagp |= POSTPONED;
9716                 nextchar(pRExC_state);
9717                 return ret;
9718                 /*notreached*/
9719             { /* named and numeric backreferences */
9720                 I32 num;
9721             case '&':            /* (?&NAME) */
9722                 parse_start = RExC_parse - 1;
9723               named_recursion:
9724                 {
9725                     SV *sv_dat = reg_scan_name(pRExC_state,
9726                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9727                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9728                 }
9729                 if (RExC_parse == RExC_end || *RExC_parse != ')')
9730                     vFAIL("Sequence (?&... not terminated");
9731                 goto gen_recurse_regop;
9732                 assert(0); /* NOT REACHED */
9733             case '+':
9734                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9735                     RExC_parse++;
9736                     vFAIL("Illegal pattern");
9737                 }
9738                 goto parse_recursion;
9739                 /* NOT REACHED*/
9740             case '-': /* (?-1) */
9741                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9742                     RExC_parse--; /* rewind to let it be handled later */
9743                     goto parse_flags;
9744                 }
9745                 /* FALLTHROUGH */
9746             case '1': case '2': case '3': case '4': /* (?1) */
9747             case '5': case '6': case '7': case '8': case '9':
9748                 RExC_parse--;
9749               parse_recursion:
9750                 num = atoi(RExC_parse);
9751                 parse_start = RExC_parse - 1; /* MJD */
9752                 if (*RExC_parse == '-')
9753                     RExC_parse++;
9754                 while (isDIGIT(*RExC_parse))
9755                         RExC_parse++;
9756                 if (*RExC_parse!=')')
9757                     vFAIL("Expecting close bracket");
9758
9759               gen_recurse_regop:
9760                 if ( paren == '-' ) {
9761                     /*
9762                     Diagram of capture buffer numbering.
9763                     Top line is the normal capture buffer numbers
9764                     Bottom line is the negative indexing as from
9765                     the X (the (?-2))
9766
9767                     +   1 2    3 4 5 X          6 7
9768                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
9769                     -   5 4    3 2 1 X          x x
9770
9771                     */
9772                     num = RExC_npar + num;
9773                     if (num < 1)  {
9774                         RExC_parse++;
9775                         vFAIL("Reference to nonexistent group");
9776                     }
9777                 } else if ( paren == '+' ) {
9778                     num = RExC_npar + num - 1;
9779                 }
9780
9781                 ret = reganode(pRExC_state, GOSUB, num);
9782                 if (!SIZE_ONLY) {
9783                     if (num > (I32)RExC_rx->nparens) {
9784                         RExC_parse++;
9785                         vFAIL("Reference to nonexistent group");
9786                     }
9787                     ARG2L_SET( ret, RExC_recurse_count++);
9788                     RExC_emit++;
9789                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9790                         "Recurse #%"UVuf" to %"IVdf"\n",
9791                               (UV)ARG(ret), (IV)ARG2L(ret)));
9792                 } else {
9793                     RExC_size++;
9794                 }
9795                     RExC_seen |= REG_RECURSE_SEEN;
9796                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
9797                 Set_Node_Offset(ret, parse_start); /* MJD */
9798
9799                 *flagp |= POSTPONED;
9800                 nextchar(pRExC_state);
9801                 return ret;
9802             } /* named and numeric backreferences */
9803             assert(0); /* NOT REACHED */
9804
9805             case '?':           /* (??...) */
9806                 is_logical = 1;
9807                 if (*RExC_parse != '{') {
9808                     RExC_parse++;
9809                     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9810                     vFAIL2utf8f(
9811                         "Sequence (%"UTF8f"...) not recognized",
9812                         UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9813                     /*NOTREACHED*/
9814                 }
9815                 *flagp |= POSTPONED;
9816                 paren = *RExC_parse++;
9817                 /* FALLTHROUGH */
9818             case '{':           /* (?{...}) */
9819             {
9820                 U32 n = 0;
9821                 struct reg_code_block *cb;
9822
9823                 RExC_seen_zerolen++;
9824
9825                 if (   !pRExC_state->num_code_blocks
9826                     || pRExC_state->code_index >= pRExC_state->num_code_blocks
9827                     || pRExC_state->code_blocks[pRExC_state->code_index].start
9828                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
9829                             - RExC_start)
9830                 ) {
9831                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
9832                         FAIL("panic: Sequence (?{...}): no code block found\n");
9833                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
9834                 }
9835                 /* this is a pre-compiled code block (?{...}) */
9836                 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
9837                 RExC_parse = RExC_start + cb->end;
9838                 if (!SIZE_ONLY) {
9839                     OP *o = cb->block;
9840                     if (cb->src_regex) {
9841                         n = add_data(pRExC_state, STR_WITH_LEN("rl"));
9842                         RExC_rxi->data->data[n] =
9843                             (void*)SvREFCNT_inc((SV*)cb->src_regex);
9844                         RExC_rxi->data->data[n+1] = (void*)o;
9845                     }
9846                     else {
9847                         n = add_data(pRExC_state,
9848                                (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
9849                         RExC_rxi->data->data[n] = (void*)o;
9850                     }
9851                 }
9852                 pRExC_state->code_index++;
9853                 nextchar(pRExC_state);
9854
9855                 if (is_logical) {
9856                     regnode *eval;
9857                     ret = reg_node(pRExC_state, LOGICAL);
9858                     eval = reganode(pRExC_state, EVAL, n);
9859                     if (!SIZE_ONLY) {
9860                         ret->flags = 2;
9861                         /* for later propagation into (??{}) return value */
9862                         eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
9863                     }
9864                     REGTAIL(pRExC_state, ret, eval);
9865                     /* deal with the length of this later - MJD */
9866                     return ret;
9867                 }
9868                 ret = reganode(pRExC_state, EVAL, n);
9869                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
9870                 Set_Node_Offset(ret, parse_start);
9871                 return ret;
9872             }
9873             case '(':           /* (?(?{...})...) and (?(?=...)...) */
9874             {
9875                 int is_define= 0;
9876                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
9877                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9878                         || RExC_parse[1] == '<'
9879                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
9880                         I32 flag;
9881                         regnode *tail;
9882
9883                         ret = reg_node(pRExC_state, LOGICAL);
9884                         if (!SIZE_ONLY)
9885                             ret->flags = 1;
9886
9887                         tail = reg(pRExC_state, 1, &flag, depth+1);
9888                         if (flag & RESTART_UTF8) {
9889                             *flagp = RESTART_UTF8;
9890                             return NULL;
9891                         }
9892                         REGTAIL(pRExC_state, ret, tail);
9893                         goto insert_if;
9894                     }
9895                 }
9896                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
9897                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9898                 {
9899                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
9900                     char *name_start= RExC_parse++;
9901                     U32 num = 0;
9902                     SV *sv_dat=reg_scan_name(pRExC_state,
9903                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9904                     if (RExC_parse == name_start || *RExC_parse != ch)
9905                         vFAIL2("Sequence (?(%c... not terminated",
9906                             (ch == '>' ? '<' : ch));
9907                     RExC_parse++;
9908                     if (!SIZE_ONLY) {
9909                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
9910                         RExC_rxi->data->data[num]=(void*)sv_dat;
9911                         SvREFCNT_inc_simple_void(sv_dat);
9912                     }
9913                     ret = reganode(pRExC_state,NGROUPP,num);
9914                     goto insert_if_check_paren;
9915                 }
9916                 else if (RExC_parse[0] == 'D' &&
9917                          RExC_parse[1] == 'E' &&
9918                          RExC_parse[2] == 'F' &&
9919                          RExC_parse[3] == 'I' &&
9920                          RExC_parse[4] == 'N' &&
9921                          RExC_parse[5] == 'E')
9922                 {
9923                     ret = reganode(pRExC_state,DEFINEP,0);
9924                     RExC_parse +=6 ;
9925                     is_define = 1;
9926                     goto insert_if_check_paren;
9927                 }
9928                 else if (RExC_parse[0] == 'R') {
9929                     RExC_parse++;
9930                     parno = 0;
9931                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9932                         parno = atoi(RExC_parse++);
9933                         while (isDIGIT(*RExC_parse))
9934                             RExC_parse++;
9935                     } else if (RExC_parse[0] == '&') {
9936                         SV *sv_dat;
9937                         RExC_parse++;
9938                         sv_dat = reg_scan_name(pRExC_state,
9939                             SIZE_ONLY
9940                             ? REG_RSN_RETURN_NULL
9941                             : REG_RSN_RETURN_DATA);
9942                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9943                     }
9944                     ret = reganode(pRExC_state,INSUBP,parno);
9945                     goto insert_if_check_paren;
9946                 }
9947                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9948                     /* (?(1)...) */
9949                     char c;
9950                     char *tmp;
9951                     parno = atoi(RExC_parse++);
9952
9953                     while (isDIGIT(*RExC_parse))
9954                         RExC_parse++;
9955                     ret = reganode(pRExC_state, GROUPP, parno);
9956
9957                  insert_if_check_paren:
9958                     if (*(tmp = nextchar(pRExC_state)) != ')') {
9959                         /* nextchar also skips comments, so undo its work
9960                          * and skip over the the next character.
9961                          */
9962                         RExC_parse = tmp;
9963                         RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9964                         vFAIL("Switch condition not recognized");
9965                     }
9966                   insert_if:
9967                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
9968                     br = regbranch(pRExC_state, &flags, 1,depth+1);
9969                     if (br == NULL) {
9970                         if (flags & RESTART_UTF8) {
9971                             *flagp = RESTART_UTF8;
9972                             return NULL;
9973                         }
9974                         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9975                               (UV) flags);
9976                     } else
9977                         REGTAIL(pRExC_state, br, reganode(pRExC_state,
9978                                                           LONGJMP, 0));
9979                     c = *nextchar(pRExC_state);
9980                     if (flags&HASWIDTH)
9981                         *flagp |= HASWIDTH;
9982                     if (c == '|') {
9983                         if (is_define)
9984                             vFAIL("(?(DEFINE)....) does not allow branches");
9985
9986                         /* Fake one for optimizer.  */
9987                         lastbr = reganode(pRExC_state, IFTHEN, 0);
9988
9989                         if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
9990                             if (flags & RESTART_UTF8) {
9991                                 *flagp = RESTART_UTF8;
9992                                 return NULL;
9993                             }
9994                             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9995                                   (UV) flags);
9996                         }
9997                         REGTAIL(pRExC_state, ret, lastbr);
9998                         if (flags&HASWIDTH)
9999                             *flagp |= HASWIDTH;
10000                         c = *nextchar(pRExC_state);
10001                     }
10002                     else
10003                         lastbr = NULL;
10004                     if (c != ')')
10005                         vFAIL("Switch (?(condition)... contains too many branches");
10006                     ender = reg_node(pRExC_state, TAIL);
10007                     REGTAIL(pRExC_state, br, ender);
10008                     if (lastbr) {
10009                         REGTAIL(pRExC_state, lastbr, ender);
10010                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10011                     }
10012                     else
10013                         REGTAIL(pRExC_state, ret, ender);
10014                     RExC_size++; /* XXX WHY do we need this?!!
10015                                     For large programs it seems to be required
10016                                     but I can't figure out why. -- dmq*/
10017                     return ret;
10018                 }
10019                 else {
10020                     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10021                     vFAIL("Unknown switch condition (?(...))");
10022                 }
10023             }
10024             case '[':           /* (?[ ... ]) */
10025                 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
10026                                          oregcomp_parse);
10027             case 0:
10028                 RExC_parse--; /* for vFAIL to print correctly */
10029                 vFAIL("Sequence (? incomplete");
10030                 break;
10031             default: /* e.g., (?i) */
10032                 --RExC_parse;
10033               parse_flags:
10034                 parse_lparen_question_flags(pRExC_state);
10035                 if (UCHARAT(RExC_parse) != ':') {
10036                     nextchar(pRExC_state);
10037                     *flagp = TRYAGAIN;
10038                     return NULL;
10039                 }
10040                 paren = ':';
10041                 nextchar(pRExC_state);
10042                 ret = NULL;
10043                 goto parse_rest;
10044             } /* end switch */
10045         }
10046         else {                  /* (...) */
10047           capturing_parens:
10048             parno = RExC_npar;
10049             RExC_npar++;
10050
10051             ret = reganode(pRExC_state, OPEN, parno);
10052             if (!SIZE_ONLY ){
10053                 if (!RExC_nestroot)
10054                     RExC_nestroot = parno;
10055                 if (RExC_seen & REG_RECURSE_SEEN
10056                     && !RExC_open_parens[parno-1])
10057                 {
10058                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10059                         "Setting open paren #%"IVdf" to %d\n",
10060                         (IV)parno, REG_NODE_NUM(ret)));
10061                     RExC_open_parens[parno-1]= ret;
10062                 }
10063             }
10064             Set_Node_Length(ret, 1); /* MJD */
10065             Set_Node_Offset(ret, RExC_parse); /* MJD */
10066             is_open = 1;
10067         }
10068     }
10069     else                        /* ! paren */
10070         ret = NULL;
10071
10072    parse_rest:
10073     /* Pick up the branches, linking them together. */
10074     parse_start = RExC_parse;   /* MJD */
10075     br = regbranch(pRExC_state, &flags, 1,depth+1);
10076
10077     /*     branch_len = (paren != 0); */
10078
10079     if (br == NULL) {
10080         if (flags & RESTART_UTF8) {
10081             *flagp = RESTART_UTF8;
10082             return NULL;
10083         }
10084         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10085     }
10086     if (*RExC_parse == '|') {
10087         if (!SIZE_ONLY && RExC_extralen) {
10088             reginsert(pRExC_state, BRANCHJ, br, depth+1);
10089         }
10090         else {                  /* MJD */
10091             reginsert(pRExC_state, BRANCH, br, depth+1);
10092             Set_Node_Length(br, paren != 0);
10093             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
10094         }
10095         have_branch = 1;
10096         if (SIZE_ONLY)
10097             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
10098     }
10099     else if (paren == ':') {
10100         *flagp |= flags&SIMPLE;
10101     }
10102     if (is_open) {                              /* Starts with OPEN. */
10103         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
10104     }
10105     else if (paren != '?')              /* Not Conditional */
10106         ret = br;
10107     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10108     lastbr = br;
10109     while (*RExC_parse == '|') {
10110         if (!SIZE_ONLY && RExC_extralen) {
10111             ender = reganode(pRExC_state, LONGJMP,0);
10112
10113             /* Append to the previous. */
10114             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10115         }
10116         if (SIZE_ONLY)
10117             RExC_extralen += 2;         /* Account for LONGJMP. */
10118         nextchar(pRExC_state);
10119         if (freeze_paren) {
10120             if (RExC_npar > after_freeze)
10121                 after_freeze = RExC_npar;
10122             RExC_npar = freeze_paren;
10123         }
10124         br = regbranch(pRExC_state, &flags, 0, depth+1);
10125
10126         if (br == NULL) {
10127             if (flags & RESTART_UTF8) {
10128                 *flagp = RESTART_UTF8;
10129                 return NULL;
10130             }
10131             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10132         }
10133         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
10134         lastbr = br;
10135         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10136     }
10137
10138     if (have_branch || paren != ':') {
10139         /* Make a closing node, and hook it on the end. */
10140         switch (paren) {
10141         case ':':
10142             ender = reg_node(pRExC_state, TAIL);
10143             break;
10144         case 1: case 2:
10145             ender = reganode(pRExC_state, CLOSE, parno);
10146             if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
10147                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10148                         "Setting close paren #%"IVdf" to %d\n",
10149                         (IV)parno, REG_NODE_NUM(ender)));
10150                 RExC_close_parens[parno-1]= ender;
10151                 if (RExC_nestroot == parno)
10152                     RExC_nestroot = 0;
10153             }
10154             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
10155             Set_Node_Length(ender,1); /* MJD */
10156             break;
10157         case '<':
10158         case ',':
10159         case '=':
10160         case '!':
10161             *flagp &= ~HASWIDTH;
10162             /* FALLTHROUGH */
10163         case '>':
10164             ender = reg_node(pRExC_state, SUCCEED);
10165             break;
10166         case 0:
10167             ender = reg_node(pRExC_state, END);
10168             if (!SIZE_ONLY) {
10169                 assert(!RExC_opend); /* there can only be one! */
10170                 RExC_opend = ender;
10171             }
10172             break;
10173         }
10174         DEBUG_PARSE_r(if (!SIZE_ONLY) {
10175             SV * const mysv_val1=sv_newmortal();
10176             SV * const mysv_val2=sv_newmortal();
10177             DEBUG_PARSE_MSG("lsbr");
10178             regprop(RExC_rx, mysv_val1, lastbr, NULL);
10179             regprop(RExC_rx, mysv_val2, ender, NULL);
10180             PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10181                           SvPV_nolen_const(mysv_val1),
10182                           (IV)REG_NODE_NUM(lastbr),
10183                           SvPV_nolen_const(mysv_val2),
10184                           (IV)REG_NODE_NUM(ender),
10185                           (IV)(ender - lastbr)
10186             );
10187         });
10188         REGTAIL(pRExC_state, lastbr, ender);
10189
10190         if (have_branch && !SIZE_ONLY) {
10191             char is_nothing= 1;
10192             if (depth==1)
10193                 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
10194
10195             /* Hook the tails of the branches to the closing node. */
10196             for (br = ret; br; br = regnext(br)) {
10197                 const U8 op = PL_regkind[OP(br)];
10198                 if (op == BRANCH) {
10199                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
10200                     if ( OP(NEXTOPER(br)) != NOTHING
10201                          || regnext(NEXTOPER(br)) != ender)
10202                         is_nothing= 0;
10203                 }
10204                 else if (op == BRANCHJ) {
10205                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
10206                     /* for now we always disable this optimisation * /
10207                     if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
10208                          || regnext(NEXTOPER(NEXTOPER(br))) != ender)
10209                     */
10210                         is_nothing= 0;
10211                 }
10212             }
10213             if (is_nothing) {
10214                 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
10215                 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10216                     SV * const mysv_val1=sv_newmortal();
10217                     SV * const mysv_val2=sv_newmortal();
10218                     DEBUG_PARSE_MSG("NADA");
10219                     regprop(RExC_rx, mysv_val1, ret, NULL);
10220                     regprop(RExC_rx, mysv_val2, ender, NULL);
10221                     PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10222                                   SvPV_nolen_const(mysv_val1),
10223                                   (IV)REG_NODE_NUM(ret),
10224                                   SvPV_nolen_const(mysv_val2),
10225                                   (IV)REG_NODE_NUM(ender),
10226                                   (IV)(ender - ret)
10227                     );
10228                 });
10229                 OP(br)= NOTHING;
10230                 if (OP(ender) == TAIL) {
10231                     NEXT_OFF(br)= 0;
10232                     RExC_emit= br + 1;
10233                 } else {
10234                     regnode *opt;
10235                     for ( opt= br + 1; opt < ender ; opt++ )
10236                         OP(opt)= OPTIMIZED;
10237                     NEXT_OFF(br)= ender - br;
10238                 }
10239             }
10240         }
10241     }
10242
10243     {
10244         const char *p;
10245         static const char parens[] = "=!<,>";
10246
10247         if (paren && (p = strchr(parens, paren))) {
10248             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
10249             int flag = (p - parens) > 1;
10250
10251             if (paren == '>')
10252                 node = SUSPEND, flag = 0;
10253             reginsert(pRExC_state, node,ret, depth+1);
10254             Set_Node_Cur_Length(ret, parse_start);
10255             Set_Node_Offset(ret, parse_start + 1);
10256             ret->flags = flag;
10257             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10258         }
10259     }
10260
10261     /* Check for proper termination. */
10262     if (paren) {
10263         /* restore original flags, but keep (?p) */
10264         RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10265         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
10266             RExC_parse = oregcomp_parse;
10267             vFAIL("Unmatched (");
10268         }
10269     }
10270     else if (!paren && RExC_parse < RExC_end) {
10271         if (*RExC_parse == ')') {
10272             RExC_parse++;
10273             vFAIL("Unmatched )");
10274         }
10275         else
10276             FAIL("Junk on end of regexp");      /* "Can't happen". */
10277         assert(0); /* NOTREACHED */
10278     }
10279
10280     if (RExC_in_lookbehind) {
10281         RExC_in_lookbehind--;
10282     }
10283     if (after_freeze > RExC_npar)
10284         RExC_npar = after_freeze;
10285     return(ret);
10286 }
10287
10288 /*
10289  - regbranch - one alternative of an | operator
10290  *
10291  * Implements the concatenation operator.
10292  *
10293  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10294  * restarted.
10295  */
10296 STATIC regnode *
10297 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10298 {
10299     dVAR;
10300     regnode *ret;
10301     regnode *chain = NULL;
10302     regnode *latest;
10303     I32 flags = 0, c = 0;
10304     GET_RE_DEBUG_FLAGS_DECL;
10305
10306     PERL_ARGS_ASSERT_REGBRANCH;
10307
10308     DEBUG_PARSE("brnc");
10309
10310     if (first)
10311         ret = NULL;
10312     else {
10313         if (!SIZE_ONLY && RExC_extralen)
10314             ret = reganode(pRExC_state, BRANCHJ,0);
10315         else {
10316             ret = reg_node(pRExC_state, BRANCH);
10317             Set_Node_Length(ret, 1);
10318         }
10319     }
10320
10321     if (!first && SIZE_ONLY)
10322         RExC_extralen += 1;                     /* BRANCHJ */
10323
10324     *flagp = WORST;                     /* Tentatively. */
10325
10326     RExC_parse--;
10327     nextchar(pRExC_state);
10328     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10329         flags &= ~TRYAGAIN;
10330         latest = regpiece(pRExC_state, &flags,depth+1);
10331         if (latest == NULL) {
10332             if (flags & TRYAGAIN)
10333                 continue;
10334             if (flags & RESTART_UTF8) {
10335                 *flagp = RESTART_UTF8;
10336                 return NULL;
10337             }
10338             FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10339         }
10340         else if (ret == NULL)
10341             ret = latest;
10342         *flagp |= flags&(HASWIDTH|POSTPONED);
10343         if (chain == NULL)      /* First piece. */
10344             *flagp |= flags&SPSTART;
10345         else {
10346             RExC_naughty++;
10347             REGTAIL(pRExC_state, chain, latest);
10348         }
10349         chain = latest;
10350         c++;
10351     }
10352     if (chain == NULL) {        /* Loop ran zero times. */
10353         chain = reg_node(pRExC_state, NOTHING);
10354         if (ret == NULL)
10355             ret = chain;
10356     }
10357     if (c == 1) {
10358         *flagp |= flags&SIMPLE;
10359     }
10360
10361     return ret;
10362 }
10363
10364 /*
10365  - regpiece - something followed by possible [*+?]
10366  *
10367  * Note that the branching code sequences used for ? and the general cases
10368  * of * and + are somewhat optimized:  they use the same NOTHING node as
10369  * both the endmarker for their branch list and the body of the last branch.
10370  * It might seem that this node could be dispensed with entirely, but the
10371  * endmarker role is not redundant.
10372  *
10373  * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10374  * TRYAGAIN.
10375  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10376  * restarted.
10377  */
10378 STATIC regnode *
10379 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10380 {
10381     dVAR;
10382     regnode *ret;
10383     char op;
10384     char *next;
10385     I32 flags;
10386     const char * const origparse = RExC_parse;
10387     I32 min;
10388     I32 max = REG_INFTY;
10389 #ifdef RE_TRACK_PATTERN_OFFSETS
10390     char *parse_start;
10391 #endif
10392     const char *maxpos = NULL;
10393
10394     /* Save the original in case we change the emitted regop to a FAIL. */
10395     regnode * const orig_emit = RExC_emit;
10396
10397     GET_RE_DEBUG_FLAGS_DECL;
10398
10399     PERL_ARGS_ASSERT_REGPIECE;
10400
10401     DEBUG_PARSE("piec");
10402
10403     ret = regatom(pRExC_state, &flags,depth+1);
10404     if (ret == NULL) {
10405         if (flags & (TRYAGAIN|RESTART_UTF8))
10406             *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10407         else
10408             FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10409         return(NULL);
10410     }
10411
10412     op = *RExC_parse;
10413
10414     if (op == '{' && regcurly(RExC_parse, FALSE)) {
10415         maxpos = NULL;
10416 #ifdef RE_TRACK_PATTERN_OFFSETS
10417         parse_start = RExC_parse; /* MJD */
10418 #endif
10419         next = RExC_parse + 1;
10420         while (isDIGIT(*next) || *next == ',') {
10421             if (*next == ',') {
10422                 if (maxpos)
10423                     break;
10424                 else
10425                     maxpos = next;
10426             }
10427             next++;
10428         }
10429         if (*next == '}') {             /* got one */
10430             if (!maxpos)
10431                 maxpos = next;
10432             RExC_parse++;
10433             min = atoi(RExC_parse);
10434             if (*maxpos == ',')
10435                 maxpos++;
10436             else
10437                 maxpos = RExC_parse;
10438             max = atoi(maxpos);
10439             if (!max && *maxpos != '0')
10440                 max = REG_INFTY;                /* meaning "infinity" */
10441             else if (max >= REG_INFTY)
10442                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10443             RExC_parse = next;
10444             nextchar(pRExC_state);
10445             if (max < min) {    /* If can't match, warn and optimize to fail
10446                                    unconditionally */
10447                 if (SIZE_ONLY) {
10448                     ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10449
10450                     /* We can't back off the size because we have to reserve
10451                      * enough space for all the things we are about to throw
10452                      * away, but we can shrink it by the ammount we are about
10453                      * to re-use here */
10454                     RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10455                 }
10456                 else {
10457                     RExC_emit = orig_emit;
10458                 }
10459                 ret = reg_node(pRExC_state, OPFAIL);
10460                 return ret;
10461             }
10462             else if (min == max
10463                      && RExC_parse < RExC_end
10464                      && (*RExC_parse == '?' || *RExC_parse == '+'))
10465             {
10466                 if (SIZE_ONLY) {
10467                     ckWARN2reg(RExC_parse + 1,
10468                                "Useless use of greediness modifier '%c'",
10469                                *RExC_parse);
10470                 }
10471                 /* Absorb the modifier, so later code doesn't see nor use
10472                     * it */
10473                 nextchar(pRExC_state);
10474             }
10475
10476         do_curly:
10477             if ((flags&SIMPLE)) {
10478                 RExC_naughty += 2 + RExC_naughty / 2;
10479                 reginsert(pRExC_state, CURLY, ret, depth+1);
10480                 Set_Node_Offset(ret, parse_start+1); /* MJD */
10481                 Set_Node_Cur_Length(ret, parse_start);
10482             }
10483             else {
10484                 regnode * const w = reg_node(pRExC_state, WHILEM);
10485
10486                 w->flags = 0;
10487                 REGTAIL(pRExC_state, ret, w);
10488                 if (!SIZE_ONLY && RExC_extralen) {
10489                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
10490                     reginsert(pRExC_state, NOTHING,ret, depth+1);
10491                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
10492                 }
10493                 reginsert(pRExC_state, CURLYX,ret, depth+1);
10494                                 /* MJD hk */
10495                 Set_Node_Offset(ret, parse_start+1);
10496                 Set_Node_Length(ret,
10497                                 op == '{' ? (RExC_parse - parse_start) : 1);
10498
10499                 if (!SIZE_ONLY && RExC_extralen)
10500                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
10501                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10502                 if (SIZE_ONLY)
10503                     RExC_whilem_seen++, RExC_extralen += 3;
10504                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
10505             }
10506             ret->flags = 0;
10507
10508             if (min > 0)
10509                 *flagp = WORST;
10510             if (max > 0)
10511                 *flagp |= HASWIDTH;
10512             if (!SIZE_ONLY) {
10513                 ARG1_SET(ret, (U16)min);
10514                 ARG2_SET(ret, (U16)max);
10515             }
10516             if (max == REG_INFTY)
10517                 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10518
10519             goto nest_check;
10520         }
10521     }
10522
10523     if (!ISMULT1(op)) {
10524         *flagp = flags;
10525         return(ret);
10526     }
10527
10528 #if 0                           /* Now runtime fix should be reliable. */
10529
10530     /* if this is reinstated, don't forget to put this back into perldiag:
10531
10532             =item Regexp *+ operand could be empty at {#} in regex m/%s/
10533
10534            (F) The part of the regexp subject to either the * or + quantifier
10535            could match an empty string. The {#} shows in the regular
10536            expression about where the problem was discovered.
10537
10538     */
10539
10540     if (!(flags&HASWIDTH) && op != '?')
10541       vFAIL("Regexp *+ operand could be empty");
10542 #endif
10543
10544 #ifdef RE_TRACK_PATTERN_OFFSETS
10545     parse_start = RExC_parse;
10546 #endif
10547     nextchar(pRExC_state);
10548
10549     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10550
10551     if (op == '*' && (flags&SIMPLE)) {
10552         reginsert(pRExC_state, STAR, ret, depth+1);
10553         ret->flags = 0;
10554         RExC_naughty += 4;
10555         RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10556     }
10557     else if (op == '*') {
10558         min = 0;
10559         goto do_curly;
10560     }
10561     else if (op == '+' && (flags&SIMPLE)) {
10562         reginsert(pRExC_state, PLUS, ret, depth+1);
10563         ret->flags = 0;
10564         RExC_naughty += 3;
10565         RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10566     }
10567     else if (op == '+') {
10568         min = 1;
10569         goto do_curly;
10570     }
10571     else if (op == '?') {
10572         min = 0; max = 1;
10573         goto do_curly;
10574     }
10575   nest_check:
10576     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10577         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10578         ckWARN2reg(RExC_parse,
10579                    "%"UTF8f" matches null string many times",
10580                    UTF8fARG(UTF, (RExC_parse >= origparse
10581                                  ? RExC_parse - origparse
10582                                  : 0),
10583                    origparse));
10584         (void)ReREFCNT_inc(RExC_rx_sv);
10585     }
10586
10587     if (RExC_parse < RExC_end && *RExC_parse == '?') {
10588         nextchar(pRExC_state);
10589         reginsert(pRExC_state, MINMOD, ret, depth+1);
10590         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
10591     }
10592     else
10593     if (RExC_parse < RExC_end && *RExC_parse == '+') {
10594         regnode *ender;
10595         nextchar(pRExC_state);
10596         ender = reg_node(pRExC_state, SUCCEED);
10597         REGTAIL(pRExC_state, ret, ender);
10598         reginsert(pRExC_state, SUSPEND, ret, depth+1);
10599         ret->flags = 0;
10600         ender = reg_node(pRExC_state, TAIL);
10601         REGTAIL(pRExC_state, ret, ender);
10602     }
10603
10604     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
10605         RExC_parse++;
10606         vFAIL("Nested quantifiers");
10607     }
10608
10609     return(ret);
10610 }
10611
10612 STATIC bool
10613 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
10614                       UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
10615                       const bool strict   /* Apply stricter parsing rules? */
10616     )
10617 {
10618
10619  /* This is expected to be called by a parser routine that has recognized '\N'
10620    and needs to handle the rest. RExC_parse is expected to point at the first
10621    char following the N at the time of the call.  On successful return,
10622    RExC_parse has been updated to point to just after the sequence identified
10623    by this routine, and <*flagp> has been updated.
10624
10625    The \N may be inside (indicated by the boolean <in_char_class>) or outside a
10626    character class.
10627
10628    \N may begin either a named sequence, or if outside a character class, mean
10629    to match a non-newline.  For non single-quoted regexes, the tokenizer has
10630    attempted to decide which, and in the case of a named sequence, converted it
10631    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
10632    where c1... are the characters in the sequence.  For single-quoted regexes,
10633    the tokenizer passes the \N sequence through unchanged; this code will not
10634    attempt to determine this nor expand those, instead raising a syntax error.
10635    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
10636    or there is no '}', it signals that this \N occurrence means to match a
10637    non-newline.
10638
10639    Only the \N{U+...} form should occur in a character class, for the same
10640    reason that '.' inside a character class means to just match a period: it
10641    just doesn't make sense.
10642
10643    The function raises an error (via vFAIL), and doesn't return for various
10644    syntax errors.  Otherwise it returns TRUE and sets <node_p> or <valuep> on
10645    success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
10646    RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
10647    only possible if node_p is non-NULL.
10648
10649
10650    If <valuep> is non-null, it means the caller can accept an input sequence
10651    consisting of a just a single code point; <*valuep> is set to that value
10652    if the input is such.
10653
10654    If <node_p> is non-null it signifies that the caller can accept any other
10655    legal sequence (i.e., one that isn't just a single code point).  <*node_p>
10656    is set as follows:
10657     1) \N means not-a-NL: points to a newly created REG_ANY node;
10658     2) \N{}:              points to a new NOTHING node;
10659     3) otherwise:         points to a new EXACT node containing the resolved
10660                           string.
10661    Note that FALSE is returned for single code point sequences if <valuep> is
10662    null.
10663  */
10664
10665     char * endbrace;    /* '}' following the name */
10666     char* p;
10667     char *endchar;      /* Points to '.' or '}' ending cur char in the input
10668                            stream */
10669     bool has_multiple_chars; /* true if the input stream contains a sequence of
10670                                 more than one character */
10671
10672     GET_RE_DEBUG_FLAGS_DECL;
10673
10674     PERL_ARGS_ASSERT_GROK_BSLASH_N;
10675
10676     GET_RE_DEBUG_FLAGS;
10677
10678     assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
10679
10680     /* The [^\n] meaning of \N ignores spaces and comments under the /x
10681      * modifier.  The other meaning does not, so use a temporary until we find
10682      * out which we are being called with */
10683     p = (RExC_flags & RXf_PMf_EXTENDED)
10684         ? regpatws(pRExC_state, RExC_parse,
10685                                 TRUE) /* means recognize comments */
10686         : RExC_parse;
10687
10688     /* Disambiguate between \N meaning a named character versus \N meaning
10689      * [^\n].  The former is assumed when it can't be the latter. */
10690     if (*p != '{' || regcurly(p, FALSE)) {
10691         RExC_parse = p;
10692         if (! node_p) {
10693             /* no bare \N allowed in a charclass */
10694             if (in_char_class) {
10695                 vFAIL("\\N in a character class must be a named character: \\N{...}");
10696             }
10697             return FALSE;
10698         }
10699         RExC_parse--;   /* Need to back off so nextchar() doesn't skip the
10700                            current char */
10701         nextchar(pRExC_state);
10702         *node_p = reg_node(pRExC_state, REG_ANY);
10703         *flagp |= HASWIDTH|SIMPLE;
10704         RExC_naughty++;
10705         Set_Node_Length(*node_p, 1); /* MJD */
10706         return TRUE;
10707     }
10708
10709     /* Here, we have decided it should be a named character or sequence */
10710
10711     /* The test above made sure that the next real character is a '{', but
10712      * under the /x modifier, it could be separated by space (or a comment and
10713      * \n) and this is not allowed (for consistency with \x{...} and the
10714      * tokenizer handling of \N{NAME}). */
10715     if (*RExC_parse != '{') {
10716         vFAIL("Missing braces on \\N{}");
10717     }
10718
10719     RExC_parse++;       /* Skip past the '{' */
10720
10721     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
10722         || ! (endbrace == RExC_parse            /* nothing between the {} */
10723               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below
10724                                                  */
10725                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg)
10726                                                      */
10727     {
10728         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
10729         vFAIL("\\N{NAME} must be resolved by the lexer");
10730     }
10731
10732     if (endbrace == RExC_parse) {   /* empty: \N{} */
10733         bool ret = TRUE;
10734         if (node_p) {
10735             *node_p = reg_node(pRExC_state,NOTHING);
10736         }
10737         else if (in_char_class) {
10738             if (SIZE_ONLY && in_char_class) {
10739                 if (strict) {
10740                     RExC_parse++;   /* Position after the "}" */
10741                     vFAIL("Zero length \\N{}");
10742                 }
10743                 else {
10744                     ckWARNreg(RExC_parse,
10745                               "Ignoring zero length \\N{} in character class");
10746                 }
10747             }
10748             ret = FALSE;
10749         }
10750         else {
10751             return FALSE;
10752         }
10753         nextchar(pRExC_state);
10754         return ret;
10755     }
10756
10757     RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
10758     RExC_parse += 2;    /* Skip past the 'U+' */
10759
10760     endchar = RExC_parse + strcspn(RExC_parse, ".}");
10761
10762     /* Code points are separated by dots.  If none, there is only one code
10763      * point, and is terminated by the brace */
10764     has_multiple_chars = (endchar < endbrace);
10765
10766     if (valuep && (! has_multiple_chars || in_char_class)) {
10767         /* We only pay attention to the first char of
10768         multichar strings being returned in char classes. I kinda wonder
10769         if this makes sense as it does change the behaviour
10770         from earlier versions, OTOH that behaviour was broken
10771         as well. XXX Solution is to recharacterize as
10772         [rest-of-class]|multi1|multi2... */
10773
10774         STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
10775         I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
10776             | PERL_SCAN_DISALLOW_PREFIX
10777             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
10778
10779         *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
10780
10781         /* The tokenizer should have guaranteed validity, but it's possible to
10782          * bypass it by using single quoting, so check */
10783         if (length_of_hex == 0
10784             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
10785         {
10786             RExC_parse += length_of_hex;        /* Includes all the valid */
10787             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
10788                             ? UTF8SKIP(RExC_parse)
10789                             : 1;
10790             /* Guard against malformed utf8 */
10791             if (RExC_parse >= endchar) {
10792                 RExC_parse = endchar;
10793             }
10794             vFAIL("Invalid hexadecimal number in \\N{U+...}");
10795         }
10796
10797         if (in_char_class && has_multiple_chars) {
10798             if (strict) {
10799                 RExC_parse = endbrace;
10800                 vFAIL("\\N{} in character class restricted to one character");
10801             }
10802             else {
10803                 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
10804             }
10805         }
10806
10807         RExC_parse = endbrace + 1;
10808     }
10809     else if (! node_p || ! has_multiple_chars) {
10810
10811         /* Here, the input is legal, but not according to the caller's
10812          * options.  We fail without advancing the parse, so that the
10813          * caller can try again */
10814         RExC_parse = p;
10815         return FALSE;
10816     }
10817     else {
10818
10819         /* What is done here is to convert this to a sub-pattern of the form
10820          * (?:\x{char1}\x{char2}...)
10821          * and then call reg recursively.  That way, it retains its atomicness,
10822          * while not having to worry about special handling that some code
10823          * points may have.  toke.c has converted the original Unicode values
10824          * to native, so that we can just pass on the hex values unchanged.  We
10825          * do have to set a flag to keep recoding from happening in the
10826          * recursion */
10827
10828         SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
10829         STRLEN len;
10830         char *orig_end = RExC_end;
10831         I32 flags;
10832
10833         while (RExC_parse < endbrace) {
10834
10835             /* Convert to notation the rest of the code understands */
10836             sv_catpv(substitute_parse, "\\x{");
10837             sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
10838             sv_catpv(substitute_parse, "}");
10839
10840             /* Point to the beginning of the next character in the sequence. */
10841             RExC_parse = endchar + 1;
10842             endchar = RExC_parse + strcspn(RExC_parse, ".}");
10843         }
10844         sv_catpv(substitute_parse, ")");
10845
10846         RExC_parse = SvPV(substitute_parse, len);
10847
10848         /* Don't allow empty number */
10849         if (len < 8) {
10850             vFAIL("Invalid hexadecimal number in \\N{U+...}");
10851         }
10852         RExC_end = RExC_parse + len;
10853
10854         /* The values are Unicode, and therefore not subject to recoding */
10855         RExC_override_recoding = 1;
10856
10857         if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
10858             if (flags & RESTART_UTF8) {
10859                 *flagp = RESTART_UTF8;
10860                 return FALSE;
10861             }
10862             FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
10863                   (UV) flags);
10864         }
10865         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10866
10867         RExC_parse = endbrace;
10868         RExC_end = orig_end;
10869         RExC_override_recoding = 0;
10870
10871         nextchar(pRExC_state);
10872     }
10873
10874     return TRUE;
10875 }
10876
10877
10878 /*
10879  * reg_recode
10880  *
10881  * It returns the code point in utf8 for the value in *encp.
10882  *    value: a code value in the source encoding
10883  *    encp:  a pointer to an Encode object
10884  *
10885  * If the result from Encode is not a single character,
10886  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
10887  */
10888 STATIC UV
10889 S_reg_recode(pTHX_ const char value, SV **encp)
10890 {
10891     STRLEN numlen = 1;
10892     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
10893     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
10894     const STRLEN newlen = SvCUR(sv);
10895     UV uv = UNICODE_REPLACEMENT;
10896
10897     PERL_ARGS_ASSERT_REG_RECODE;
10898
10899     if (newlen)
10900         uv = SvUTF8(sv)
10901              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
10902              : *(U8*)s;
10903
10904     if (!newlen || numlen != newlen) {
10905         uv = UNICODE_REPLACEMENT;
10906         *encp = NULL;
10907     }
10908     return uv;
10909 }
10910
10911 PERL_STATIC_INLINE U8
10912 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
10913 {
10914     U8 op;
10915
10916     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
10917
10918     if (! FOLD) {
10919         return EXACT;
10920     }
10921
10922     op = get_regex_charset(RExC_flags);
10923     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
10924         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
10925                  been, so there is no hole */
10926     }
10927
10928     return op + EXACTF;
10929 }
10930
10931 PERL_STATIC_INLINE void
10932 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
10933                          regnode *node, I32* flagp, STRLEN len, UV code_point,
10934                          bool downgradable)
10935 {
10936     /* This knows the details about sizing an EXACTish node, setting flags for
10937      * it (by setting <*flagp>, and potentially populating it with a single
10938      * character.
10939      *
10940      * If <len> (the length in bytes) is non-zero, this function assumes that
10941      * the node has already been populated, and just does the sizing.  In this
10942      * case <code_point> should be the final code point that has already been
10943      * placed into the node.  This value will be ignored except that under some
10944      * circumstances <*flagp> is set based on it.
10945      *
10946      * If <len> is zero, the function assumes that the node is to contain only
10947      * the single character given by <code_point> and calculates what <len>
10948      * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
10949      * additionally will populate the node's STRING with <code_point> or its
10950      * fold if folding.
10951      *
10952      * In both cases <*flagp> is appropriately set
10953      *
10954      * It knows that under FOLD, the Latin Sharp S and UTF characters above
10955      * 255, must be folded (the former only when the rules indicate it can
10956      * match 'ss')
10957      *
10958      * When it does the populating, it looks at the flag 'downgradable'.  If
10959      * true with a node that folds, it checks if the single code point
10960      * participates in a fold, and if not downgrades the node to an EXACT.
10961      * This helps the optimizer */
10962
10963     bool len_passed_in = cBOOL(len != 0);
10964     U8 character[UTF8_MAXBYTES_CASE+1];
10965
10966     PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
10967
10968     /* Don't bother to check for downgrading in PASS1, as it doesn't make any
10969      * sizing difference, and is extra work that is thrown away */
10970     if (downgradable && ! PASS2) {
10971         downgradable = FALSE;
10972     }
10973
10974     if (! len_passed_in) {
10975         if (UTF) {
10976             if (UNI_IS_INVARIANT(code_point)) {
10977                 if (LOC || ! FOLD) {    /* /l defers folding until runtime */
10978                     *character = (U8) code_point;
10979                 }
10980                 else { /* Here is /i and not /l (toFOLD() is defined on just
10981                           ASCII, which isn't the same thing as INVARIANT on
10982                           EBCDIC, but it works there, as the extra invariants
10983                           fold to themselves) */
10984                     *character = toFOLD((U8) code_point);
10985                     if (downgradable
10986                         && *character == code_point
10987                         && ! HAS_NONLATIN1_FOLD_CLOSURE(code_point))
10988                     {
10989                         OP(node) = EXACT;
10990                     }
10991                 }
10992                 len = 1;
10993             }
10994             else if (FOLD && (! LOC
10995                               || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
10996             {   /* Folding, and ok to do so now */
10997                 UV folded = _to_uni_fold_flags(
10998                                    code_point,
10999                                    character,
11000                                    &len,
11001                                    FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
11002                                                       ? FOLD_FLAGS_NOMIX_ASCII
11003                                                       : 0));
11004                 if (downgradable
11005                     && folded == code_point
11006                     && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
11007                 {
11008                     OP(node) = EXACT;
11009                 }
11010             }
11011             else if (code_point <= MAX_UTF8_TWO_BYTE) {
11012
11013                 /* Not folding this cp, and can output it directly */
11014                 *character = UTF8_TWO_BYTE_HI(code_point);
11015                 *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
11016                 len = 2;
11017             }
11018             else {
11019                 uvchr_to_utf8( character, code_point);
11020                 len = UTF8SKIP(character);
11021             }
11022         } /* Else pattern isn't UTF8.  */
11023         else if (! FOLD) {
11024             *character = (U8) code_point;
11025             len = 1;
11026         } /* Else is folded non-UTF8 */
11027         else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
11028
11029             /* We don't fold any non-UTF8 except possibly the Sharp s  (see
11030              * comments at join_exact()); */
11031             *character = (U8) code_point;
11032             len = 1;
11033
11034             /* Can turn into an EXACT node if we know the fold at compile time,
11035              * and it folds to itself and doesn't particpate in other folds */
11036             if (downgradable
11037                 && ! LOC
11038                 && PL_fold_latin1[code_point] == code_point
11039                 && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11040                     || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
11041             {
11042                 OP(node) = EXACT;
11043             }
11044         } /* else is Sharp s.  May need to fold it */
11045         else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
11046             *character = 's';
11047             *(character + 1) = 's';
11048             len = 2;
11049         }
11050         else {
11051             *character = LATIN_SMALL_LETTER_SHARP_S;
11052             len = 1;
11053         }
11054     }
11055
11056     if (SIZE_ONLY) {
11057         RExC_size += STR_SZ(len);
11058     }
11059     else {
11060         RExC_emit += STR_SZ(len);
11061         STR_LEN(node) = len;
11062         if (! len_passed_in) {
11063             Copy((char *) character, STRING(node), len, char);
11064         }
11065     }
11066
11067     *flagp |= HASWIDTH;
11068
11069     /* A single character node is SIMPLE, except for the special-cased SHARP S
11070      * under /di. */
11071     if ((len == 1 || (UTF && len == UNISKIP(code_point)))
11072         && (code_point != LATIN_SMALL_LETTER_SHARP_S
11073             || ! FOLD || ! DEPENDS_SEMANTICS))
11074     {
11075         *flagp |= SIMPLE;
11076     }
11077
11078     /* The OP may not be well defined in PASS1 */
11079     if (PASS2 && OP(node) == EXACTFL) {
11080         RExC_contains_locale = 1;
11081     }
11082 }
11083
11084
11085 /* return atoi(p), unless it's too big to sensibly be a backref,
11086  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
11087
11088 static I32
11089 S_backref_value(char *p)
11090 {
11091     char *q = p;
11092
11093     for (;isDIGIT(*q); q++) {} /* calculate length of num */
11094     if (q - p == 0 || q - p > 9)
11095         return I32_MAX;
11096     return atoi(p);
11097 }
11098
11099
11100 /*
11101  - regatom - the lowest level
11102
11103    Try to identify anything special at the start of the pattern. If there
11104    is, then handle it as required. This may involve generating a single regop,
11105    such as for an assertion; or it may involve recursing, such as to
11106    handle a () structure.
11107
11108    If the string doesn't start with something special then we gobble up
11109    as much literal text as we can.
11110
11111    Once we have been able to handle whatever type of thing started the
11112    sequence, we return.
11113
11114    Note: we have to be careful with escapes, as they can be both literal
11115    and special, and in the case of \10 and friends, context determines which.
11116
11117    A summary of the code structure is:
11118
11119    switch (first_byte) {
11120         cases for each special:
11121             handle this special;
11122             break;
11123         case '\\':
11124             switch (2nd byte) {
11125                 cases for each unambiguous special:
11126                     handle this special;
11127                     break;
11128                 cases for each ambigous special/literal:
11129                     disambiguate;
11130                     if (special)  handle here
11131                     else goto defchar;
11132                 default: // unambiguously literal:
11133                     goto defchar;
11134             }
11135         default:  // is a literal char
11136             // FALL THROUGH
11137         defchar:
11138             create EXACTish node for literal;
11139             while (more input and node isn't full) {
11140                 switch (input_byte) {
11141                    cases for each special;
11142                        make sure parse pointer is set so that the next call to
11143                            regatom will see this special first
11144                        goto loopdone; // EXACTish node terminated by prev. char
11145                    default:
11146                        append char to EXACTISH node;
11147                 }
11148                 get next input byte;
11149             }
11150         loopdone:
11151    }
11152    return the generated node;
11153
11154    Specifically there are two separate switches for handling
11155    escape sequences, with the one for handling literal escapes requiring
11156    a dummy entry for all of the special escapes that are actually handled
11157    by the other.
11158
11159    Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
11160    TRYAGAIN.
11161    Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
11162    restarted.
11163    Otherwise does not return NULL.
11164 */
11165
11166 STATIC regnode *
11167 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11168 {
11169     dVAR;
11170     regnode *ret = NULL;
11171     I32 flags = 0;
11172     char *parse_start = RExC_parse;
11173     U8 op;
11174     int invert = 0;
11175
11176     GET_RE_DEBUG_FLAGS_DECL;
11177
11178     *flagp = WORST;             /* Tentatively. */
11179
11180     DEBUG_PARSE("atom");
11181
11182     PERL_ARGS_ASSERT_REGATOM;
11183
11184 tryagain:
11185     switch ((U8)*RExC_parse) {
11186     case '^':
11187         RExC_seen_zerolen++;
11188         nextchar(pRExC_state);
11189         if (RExC_flags & RXf_PMf_MULTILINE)
11190             ret = reg_node(pRExC_state, MBOL);
11191         else if (RExC_flags & RXf_PMf_SINGLELINE)
11192             ret = reg_node(pRExC_state, SBOL);
11193         else
11194             ret = reg_node(pRExC_state, BOL);
11195         Set_Node_Length(ret, 1); /* MJD */
11196         break;
11197     case '$':
11198         nextchar(pRExC_state);
11199         if (*RExC_parse)
11200             RExC_seen_zerolen++;
11201         if (RExC_flags & RXf_PMf_MULTILINE)
11202             ret = reg_node(pRExC_state, MEOL);
11203         else if (RExC_flags & RXf_PMf_SINGLELINE)
11204             ret = reg_node(pRExC_state, SEOL);
11205         else
11206             ret = reg_node(pRExC_state, EOL);
11207         Set_Node_Length(ret, 1); /* MJD */
11208         break;
11209     case '.':
11210         nextchar(pRExC_state);
11211         if (RExC_flags & RXf_PMf_SINGLELINE)
11212             ret = reg_node(pRExC_state, SANY);
11213         else
11214             ret = reg_node(pRExC_state, REG_ANY);
11215         *flagp |= HASWIDTH|SIMPLE;
11216         RExC_naughty++;
11217         Set_Node_Length(ret, 1); /* MJD */
11218         break;
11219     case '[':
11220     {
11221         char * const oregcomp_parse = ++RExC_parse;
11222         ret = regclass(pRExC_state, flagp,depth+1,
11223                        FALSE, /* means parse the whole char class */
11224                        TRUE, /* allow multi-char folds */
11225                        FALSE, /* don't silence non-portable warnings. */
11226                        NULL);
11227         if (*RExC_parse != ']') {
11228             RExC_parse = oregcomp_parse;
11229             vFAIL("Unmatched [");
11230         }
11231         if (ret == NULL) {
11232             if (*flagp & RESTART_UTF8)
11233                 return NULL;
11234             FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11235                   (UV) *flagp);
11236         }
11237         nextchar(pRExC_state);
11238         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
11239         break;
11240     }
11241     case '(':
11242         nextchar(pRExC_state);
11243         ret = reg(pRExC_state, 2, &flags,depth+1);
11244         if (ret == NULL) {
11245                 if (flags & TRYAGAIN) {
11246                     if (RExC_parse == RExC_end) {
11247                          /* Make parent create an empty node if needed. */
11248                         *flagp |= TRYAGAIN;
11249                         return(NULL);
11250                     }
11251                     goto tryagain;
11252                 }
11253                 if (flags & RESTART_UTF8) {
11254                     *flagp = RESTART_UTF8;
11255                     return NULL;
11256                 }
11257                 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
11258                                                                  (UV) flags);
11259         }
11260         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11261         break;
11262     case '|':
11263     case ')':
11264         if (flags & TRYAGAIN) {
11265             *flagp |= TRYAGAIN;
11266             return NULL;
11267         }
11268         vFAIL("Internal urp");
11269                                 /* Supposed to be caught earlier. */
11270         break;
11271     case '{':
11272         if (!regcurly(RExC_parse, FALSE)) {
11273             RExC_parse++;
11274             goto defchar;
11275         }
11276         /* FALLTHROUGH */
11277     case '?':
11278     case '+':
11279     case '*':
11280         RExC_parse++;
11281         vFAIL("Quantifier follows nothing");
11282         break;
11283     case '\\':
11284         /* Special Escapes
11285
11286            This switch handles escape sequences that resolve to some kind
11287            of special regop and not to literal text. Escape sequnces that
11288            resolve to literal text are handled below in the switch marked
11289            "Literal Escapes".
11290
11291            Every entry in this switch *must* have a corresponding entry
11292            in the literal escape switch. However, the opposite is not
11293            required, as the default for this switch is to jump to the
11294            literal text handling code.
11295         */
11296         switch ((U8)*++RExC_parse) {
11297             U8 arg;
11298         /* Special Escapes */
11299         case 'A':
11300             RExC_seen_zerolen++;
11301             ret = reg_node(pRExC_state, SBOL);
11302             *flagp |= SIMPLE;
11303             goto finish_meta_pat;
11304         case 'G':
11305             ret = reg_node(pRExC_state, GPOS);
11306             RExC_seen |= REG_GPOS_SEEN;
11307             *flagp |= SIMPLE;
11308             goto finish_meta_pat;
11309         case 'K':
11310             RExC_seen_zerolen++;
11311             ret = reg_node(pRExC_state, KEEPS);
11312             *flagp |= SIMPLE;
11313             /* XXX:dmq : disabling in-place substitution seems to
11314              * be necessary here to avoid cases of memory corruption, as
11315              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
11316              */
11317             RExC_seen |= REG_LOOKBEHIND_SEEN;
11318             goto finish_meta_pat;
11319         case 'Z':
11320             ret = reg_node(pRExC_state, SEOL);
11321             *flagp |= SIMPLE;
11322             RExC_seen_zerolen++;                /* Do not optimize RE away */
11323             goto finish_meta_pat;
11324         case 'z':
11325             ret = reg_node(pRExC_state, EOS);
11326             *flagp |= SIMPLE;
11327             RExC_seen_zerolen++;                /* Do not optimize RE away */
11328             goto finish_meta_pat;
11329         case 'C':
11330             ret = reg_node(pRExC_state, CANY);
11331             RExC_seen |= REG_CANY_SEEN;
11332             *flagp |= HASWIDTH|SIMPLE;
11333             goto finish_meta_pat;
11334         case 'X':
11335             ret = reg_node(pRExC_state, CLUMP);
11336             *flagp |= HASWIDTH;
11337             goto finish_meta_pat;
11338
11339         case 'W':
11340             invert = 1;
11341             /* FALLTHROUGH */
11342         case 'w':
11343             arg = ANYOF_WORDCHAR;
11344             goto join_posix;
11345
11346         case 'b':
11347             RExC_seen_zerolen++;
11348             RExC_seen |= REG_LOOKBEHIND_SEEN;
11349             op = BOUND + get_regex_charset(RExC_flags);
11350             if (op > BOUNDA) {  /* /aa is same as /a */
11351                 op = BOUNDA;
11352             }
11353             else if (op == BOUNDL) {
11354                 RExC_contains_locale = 1;
11355             }
11356             ret = reg_node(pRExC_state, op);
11357             FLAGS(ret) = get_regex_charset(RExC_flags);
11358             *flagp |= SIMPLE;
11359             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11360                 /* diag_listed_as: Use "%s" instead of "%s" */
11361                 vFAIL("Use \"\\b\\{\" instead of \"\\b{\"");
11362             }
11363             goto finish_meta_pat;
11364         case 'B':
11365             RExC_seen_zerolen++;
11366             RExC_seen |= REG_LOOKBEHIND_SEEN;
11367             op = NBOUND + get_regex_charset(RExC_flags);
11368             if (op > NBOUNDA) { /* /aa is same as /a */
11369                 op = NBOUNDA;
11370             }
11371             else if (op == NBOUNDL) {
11372                 RExC_contains_locale = 1;
11373             }
11374             ret = reg_node(pRExC_state, op);
11375             FLAGS(ret) = get_regex_charset(RExC_flags);
11376             *flagp |= SIMPLE;
11377             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11378                 /* diag_listed_as: Use "%s" instead of "%s" */
11379                 vFAIL("Use \"\\B\\{\" instead of \"\\B{\"");
11380             }
11381             goto finish_meta_pat;
11382
11383         case 'D':
11384             invert = 1;
11385             /* FALLTHROUGH */
11386         case 'd':
11387             arg = ANYOF_DIGIT;
11388             goto join_posix;
11389
11390         case 'R':
11391             ret = reg_node(pRExC_state, LNBREAK);
11392             *flagp |= HASWIDTH|SIMPLE;
11393             goto finish_meta_pat;
11394
11395         case 'H':
11396             invert = 1;
11397             /* FALLTHROUGH */
11398         case 'h':
11399             arg = ANYOF_BLANK;
11400             op = POSIXU;
11401             goto join_posix_op_known;
11402
11403         case 'V':
11404             invert = 1;
11405             /* FALLTHROUGH */
11406         case 'v':
11407             arg = ANYOF_VERTWS;
11408             op = POSIXU;
11409             goto join_posix_op_known;
11410
11411         case 'S':
11412             invert = 1;
11413             /* FALLTHROUGH */
11414         case 's':
11415             arg = ANYOF_SPACE;
11416
11417         join_posix:
11418
11419             op = POSIXD + get_regex_charset(RExC_flags);
11420             if (op > POSIXA) {  /* /aa is same as /a */
11421                 op = POSIXA;
11422             }
11423             else if (op == POSIXL) {
11424                 RExC_contains_locale = 1;
11425             }
11426
11427         join_posix_op_known:
11428
11429             if (invert) {
11430                 op += NPOSIXD - POSIXD;
11431             }
11432
11433             ret = reg_node(pRExC_state, op);
11434             if (! SIZE_ONLY) {
11435                 FLAGS(ret) = namedclass_to_classnum(arg);
11436             }
11437
11438             *flagp |= HASWIDTH|SIMPLE;
11439             /* FALLTHROUGH */
11440
11441          finish_meta_pat:
11442             nextchar(pRExC_state);
11443             Set_Node_Length(ret, 2); /* MJD */
11444             break;
11445         case 'p':
11446         case 'P':
11447             {
11448 #ifdef DEBUGGING
11449                 char* parse_start = RExC_parse - 2;
11450 #endif
11451
11452                 RExC_parse--;
11453
11454                 ret = regclass(pRExC_state, flagp,depth+1,
11455                                TRUE, /* means just parse this element */
11456                                FALSE, /* don't allow multi-char folds */
11457                                FALSE, /* don't silence non-portable warnings.
11458                                          It would be a bug if these returned
11459                                          non-portables */
11460                                NULL);
11461                 /* regclass() can only return RESTART_UTF8 if multi-char folds
11462                    are allowed.  */
11463                 if (!ret)
11464                     FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11465                           (UV) *flagp);
11466
11467                 RExC_parse--;
11468
11469                 Set_Node_Offset(ret, parse_start + 2);
11470                 Set_Node_Cur_Length(ret, parse_start);
11471                 nextchar(pRExC_state);
11472             }
11473             break;
11474         case 'N':
11475             /* Handle \N and \N{NAME} with multiple code points here and not
11476              * below because it can be multicharacter. join_exact() will join
11477              * them up later on.  Also this makes sure that things like
11478              * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
11479              * The options to the grok function call causes it to fail if the
11480              * sequence is just a single code point.  We then go treat it as
11481              * just another character in the current EXACT node, and hence it
11482              * gets uniform treatment with all the other characters.  The
11483              * special treatment for quantifiers is not needed for such single
11484              * character sequences */
11485             ++RExC_parse;
11486             if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
11487                                 FALSE /* not strict */ )) {
11488                 if (*flagp & RESTART_UTF8)
11489                     return NULL;
11490                 RExC_parse--;
11491                 goto defchar;
11492             }
11493             break;
11494         case 'k':    /* Handle \k<NAME> and \k'NAME' */
11495         parse_named_seq:
11496         {
11497             char ch= RExC_parse[1];
11498             if (ch != '<' && ch != '\'' && ch != '{') {
11499                 RExC_parse++;
11500                 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11501                 vFAIL2("Sequence %.2s... not terminated",parse_start);
11502             } else {
11503                 /* this pretty much dupes the code for (?P=...) in reg(), if
11504                    you change this make sure you change that */
11505                 char* name_start = (RExC_parse += 2);
11506                 U32 num = 0;
11507                 SV *sv_dat = reg_scan_name(pRExC_state,
11508                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11509                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
11510                 if (RExC_parse == name_start || *RExC_parse != ch)
11511                     /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11512                     vFAIL2("Sequence %.3s... not terminated",parse_start);
11513
11514                 if (!SIZE_ONLY) {
11515                     num = add_data( pRExC_state, STR_WITH_LEN("S"));
11516                     RExC_rxi->data->data[num]=(void*)sv_dat;
11517                     SvREFCNT_inc_simple_void(sv_dat);
11518                 }
11519
11520                 RExC_sawback = 1;
11521                 ret = reganode(pRExC_state,
11522                                ((! FOLD)
11523                                  ? NREF
11524                                  : (ASCII_FOLD_RESTRICTED)
11525                                    ? NREFFA
11526                                    : (AT_LEAST_UNI_SEMANTICS)
11527                                      ? NREFFU
11528                                      : (LOC)
11529                                        ? NREFFL
11530                                        : NREFF),
11531                                 num);
11532                 *flagp |= HASWIDTH;
11533
11534                 /* override incorrect value set in reganode MJD */
11535                 Set_Node_Offset(ret, parse_start+1);
11536                 Set_Node_Cur_Length(ret, parse_start);
11537                 nextchar(pRExC_state);
11538
11539             }
11540             break;
11541         }
11542         case 'g':
11543         case '1': case '2': case '3': case '4':
11544         case '5': case '6': case '7': case '8': case '9':
11545             {
11546                 I32 num;
11547                 bool hasbrace = 0;
11548
11549                 if (*RExC_parse == 'g') {
11550                     bool isrel = 0;
11551
11552                     RExC_parse++;
11553                     if (*RExC_parse == '{') {
11554                         RExC_parse++;
11555                         hasbrace = 1;
11556                     }
11557                     if (*RExC_parse == '-') {
11558                         RExC_parse++;
11559                         isrel = 1;
11560                     }
11561                     if (hasbrace && !isDIGIT(*RExC_parse)) {
11562                         if (isrel) RExC_parse--;
11563                         RExC_parse -= 2;
11564                         goto parse_named_seq;
11565                     }
11566
11567                     num = S_backref_value(RExC_parse);
11568                     if (num == 0)
11569                         vFAIL("Reference to invalid group 0");
11570                     else if (num == I32_MAX) {
11571                          if (isDIGIT(*RExC_parse))
11572                             vFAIL("Reference to nonexistent group");
11573                         else
11574                             vFAIL("Unterminated \\g... pattern");
11575                     }
11576
11577                     if (isrel) {
11578                         num = RExC_npar - num;
11579                         if (num < 1)
11580                             vFAIL("Reference to nonexistent or unclosed group");
11581                     }
11582                 }
11583                 else {
11584                     num = S_backref_value(RExC_parse);
11585                     /* bare \NNN might be backref or octal - if it is larger than or equal
11586                      * RExC_npar then it is assumed to be and octal escape.
11587                      * Note RExC_npar is +1 from the actual number of parens*/
11588                     if (num == I32_MAX || (num > 9 && num >= RExC_npar
11589                             && *RExC_parse != '8' && *RExC_parse != '9'))
11590                     {
11591                         /* Probably a character specified in octal, e.g. \35 */
11592                         goto defchar;
11593                     }
11594                 }
11595
11596                 /* at this point RExC_parse definitely points to a backref
11597                  * number */
11598                 {
11599 #ifdef RE_TRACK_PATTERN_OFFSETS
11600                     char * const parse_start = RExC_parse - 1; /* MJD */
11601 #endif
11602                     while (isDIGIT(*RExC_parse))
11603                         RExC_parse++;
11604                     if (hasbrace) {
11605                         if (*RExC_parse != '}')
11606                             vFAIL("Unterminated \\g{...} pattern");
11607                         RExC_parse++;
11608                     }
11609                     if (!SIZE_ONLY) {
11610                         if (num > (I32)RExC_rx->nparens)
11611                             vFAIL("Reference to nonexistent group");
11612                     }
11613                     RExC_sawback = 1;
11614                     ret = reganode(pRExC_state,
11615                                    ((! FOLD)
11616                                      ? REF
11617                                      : (ASCII_FOLD_RESTRICTED)
11618                                        ? REFFA
11619                                        : (AT_LEAST_UNI_SEMANTICS)
11620                                          ? REFFU
11621                                          : (LOC)
11622                                            ? REFFL
11623                                            : REFF),
11624                                     num);
11625                     *flagp |= HASWIDTH;
11626
11627                     /* override incorrect value set in reganode MJD */
11628                     Set_Node_Offset(ret, parse_start+1);
11629                     Set_Node_Cur_Length(ret, parse_start);
11630                     RExC_parse--;
11631                     nextchar(pRExC_state);
11632                 }
11633             }
11634             break;
11635         case '\0':
11636             if (RExC_parse >= RExC_end)
11637                 FAIL("Trailing \\");
11638             /* FALLTHROUGH */
11639         default:
11640             /* Do not generate "unrecognized" warnings here, we fall
11641                back into the quick-grab loop below */
11642             parse_start--;
11643             goto defchar;
11644         }
11645         break;
11646
11647     case '#':
11648         if (RExC_flags & RXf_PMf_EXTENDED) {
11649             RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
11650             if (RExC_parse < RExC_end)
11651                 goto tryagain;
11652         }
11653         /* FALLTHROUGH */
11654
11655     default:
11656
11657             parse_start = RExC_parse - 1;
11658
11659             RExC_parse++;
11660
11661         defchar: {
11662             STRLEN len = 0;
11663             UV ender = 0;
11664             char *p;
11665             char *s;
11666 #define MAX_NODE_STRING_SIZE 127
11667             char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
11668             char *s0;
11669             U8 upper_parse = MAX_NODE_STRING_SIZE;
11670             U8 node_type = compute_EXACTish(pRExC_state);
11671             bool next_is_quantifier;
11672             char * oldp = NULL;
11673
11674             /* We can convert EXACTF nodes to EXACTFU if they contain only
11675              * characters that match identically regardless of the target
11676              * string's UTF8ness.  The reason to do this is that EXACTF is not
11677              * trie-able, EXACTFU is.
11678              *
11679              * Similarly, we can convert EXACTFL nodes to EXACTFU if they
11680              * contain only above-Latin1 characters (hence must be in UTF8),
11681              * which don't participate in folds with Latin1-range characters,
11682              * as the latter's folds aren't known until runtime.  (We don't
11683              * need to figure this out until pass 2) */
11684             bool maybe_exactfu = PASS2
11685                                && (node_type == EXACTF || node_type == EXACTFL);
11686
11687             /* If a folding node contains only code points that don't
11688              * participate in folds, it can be changed into an EXACT node,
11689              * which allows the optimizer more things to look for */
11690             bool maybe_exact;
11691
11692             ret = reg_node(pRExC_state, node_type);
11693
11694             /* In pass1, folded, we use a temporary buffer instead of the
11695              * actual node, as the node doesn't exist yet */
11696             s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
11697
11698             s0 = s;
11699
11700         reparse:
11701
11702             /* We do the EXACTFish to EXACT node only if folding.  (And we
11703              * don't need to figure this out until pass 2) */
11704             maybe_exact = FOLD && PASS2;
11705
11706             /* XXX The node can hold up to 255 bytes, yet this only goes to
11707              * 127.  I (khw) do not know why.  Keeping it somewhat less than
11708              * 255 allows us to not have to worry about overflow due to
11709              * converting to utf8 and fold expansion, but that value is
11710              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
11711              * split up by this limit into a single one using the real max of
11712              * 255.  Even at 127, this breaks under rare circumstances.  If
11713              * folding, we do not want to split a node at a character that is a
11714              * non-final in a multi-char fold, as an input string could just
11715              * happen to want to match across the node boundary.  The join
11716              * would solve that problem if the join actually happens.  But a
11717              * series of more than two nodes in a row each of 127 would cause
11718              * the first join to succeed to get to 254, but then there wouldn't
11719              * be room for the next one, which could at be one of those split
11720              * multi-char folds.  I don't know of any fool-proof solution.  One
11721              * could back off to end with only a code point that isn't such a
11722              * non-final, but it is possible for there not to be any in the
11723              * entire node. */
11724             for (p = RExC_parse - 1;
11725                  len < upper_parse && p < RExC_end;
11726                  len++)
11727             {
11728                 oldp = p;
11729
11730                 if (RExC_flags & RXf_PMf_EXTENDED)
11731                     p = regpatws(pRExC_state, p,
11732                                           TRUE); /* means recognize comments */
11733                 switch ((U8)*p) {
11734                 case '^':
11735                 case '$':
11736                 case '.':
11737                 case '[':
11738                 case '(':
11739                 case ')':
11740                 case '|':
11741                     goto loopdone;
11742                 case '\\':
11743                     /* Literal Escapes Switch
11744
11745                        This switch is meant to handle escape sequences that
11746                        resolve to a literal character.
11747
11748                        Every escape sequence that represents something
11749                        else, like an assertion or a char class, is handled
11750                        in the switch marked 'Special Escapes' above in this
11751                        routine, but also has an entry here as anything that
11752                        isn't explicitly mentioned here will be treated as
11753                        an unescaped equivalent literal.
11754                     */
11755
11756                     switch ((U8)*++p) {
11757                     /* These are all the special escapes. */
11758                     case 'A':             /* Start assertion */
11759                     case 'b': case 'B':   /* Word-boundary assertion*/
11760                     case 'C':             /* Single char !DANGEROUS! */
11761                     case 'd': case 'D':   /* digit class */
11762                     case 'g': case 'G':   /* generic-backref, pos assertion */
11763                     case 'h': case 'H':   /* HORIZWS */
11764                     case 'k': case 'K':   /* named backref, keep marker */
11765                     case 'p': case 'P':   /* Unicode property */
11766                               case 'R':   /* LNBREAK */
11767                     case 's': case 'S':   /* space class */
11768                     case 'v': case 'V':   /* VERTWS */
11769                     case 'w': case 'W':   /* word class */
11770                     case 'X':             /* eXtended Unicode "combining
11771                                              character sequence" */
11772                     case 'z': case 'Z':   /* End of line/string assertion */
11773                         --p;
11774                         goto loopdone;
11775
11776                     /* Anything after here is an escape that resolves to a
11777                        literal. (Except digits, which may or may not)
11778                      */
11779                     case 'n':
11780                         ender = '\n';
11781                         p++;
11782                         break;
11783                     case 'N': /* Handle a single-code point named character. */
11784                         /* The options cause it to fail if a multiple code
11785                          * point sequence.  Handle those in the switch() above
11786                          * */
11787                         RExC_parse = p + 1;
11788                         if (! grok_bslash_N(pRExC_state, NULL, &ender,
11789                                             flagp, depth, FALSE,
11790                                             FALSE /* not strict */ ))
11791                         {
11792                             if (*flagp & RESTART_UTF8)
11793                                 FAIL("panic: grok_bslash_N set RESTART_UTF8");
11794                             RExC_parse = p = oldp;
11795                             goto loopdone;
11796                         }
11797                         p = RExC_parse;
11798                         if (ender > 0xff) {
11799                             REQUIRE_UTF8;
11800                         }
11801                         break;
11802                     case 'r':
11803                         ender = '\r';
11804                         p++;
11805                         break;
11806                     case 't':
11807                         ender = '\t';
11808                         p++;
11809                         break;
11810                     case 'f':
11811                         ender = '\f';
11812                         p++;
11813                         break;
11814                     case 'e':
11815                           ender = ASCII_TO_NATIVE('\033');
11816                         p++;
11817                         break;
11818                     case 'a':
11819                           ender = '\a';
11820                         p++;
11821                         break;
11822                     case 'o':
11823                         {
11824                             UV result;
11825                             const char* error_msg;
11826
11827                             bool valid = grok_bslash_o(&p,
11828                                                        &result,
11829                                                        &error_msg,
11830                                                        TRUE, /* out warnings */
11831                                                        FALSE, /* not strict */
11832                                                        TRUE, /* Output warnings
11833                                                                 for non-
11834                                                                 portables */
11835                                                        UTF);
11836                             if (! valid) {
11837                                 RExC_parse = p; /* going to die anyway; point
11838                                                    to exact spot of failure */
11839                                 vFAIL(error_msg);
11840                             }
11841                             ender = result;
11842                             if (PL_encoding && ender < 0x100) {
11843                                 goto recode_encoding;
11844                             }
11845                             if (ender > 0xff) {
11846                                 REQUIRE_UTF8;
11847                             }
11848                             break;
11849                         }
11850                     case 'x':
11851                         {
11852                             UV result = UV_MAX; /* initialize to erroneous
11853                                                    value */
11854                             const char* error_msg;
11855
11856                             bool valid = grok_bslash_x(&p,
11857                                                        &result,
11858                                                        &error_msg,
11859                                                        TRUE, /* out warnings */
11860                                                        FALSE, /* not strict */
11861                                                        TRUE, /* Output warnings
11862                                                                 for non-
11863                                                                 portables */
11864                                                        UTF);
11865                             if (! valid) {
11866                                 RExC_parse = p; /* going to die anyway; point
11867                                                    to exact spot of failure */
11868                                 vFAIL(error_msg);
11869                             }
11870                             ender = result;
11871
11872                             if (PL_encoding && ender < 0x100) {
11873                                 goto recode_encoding;
11874                             }
11875                             if (ender > 0xff) {
11876                                 REQUIRE_UTF8;
11877                             }
11878                             break;
11879                         }
11880                     case 'c':
11881                         p++;
11882                         ender = grok_bslash_c(*p++, SIZE_ONLY);
11883                         break;
11884                     case '8': case '9': /* must be a backreference */
11885                         --p;
11886                         goto loopdone;
11887                     case '1': case '2': case '3':case '4':
11888                     case '5': case '6': case '7':
11889                         /* When we parse backslash escapes there is ambiguity
11890                          * between backreferences and octal escapes. Any escape
11891                          * from \1 - \9 is a backreference, any multi-digit
11892                          * escape which does not start with 0 and which when
11893                          * evaluated as decimal could refer to an already
11894                          * parsed capture buffer is a backslash. Anything else
11895                          * is octal.
11896                          *
11897                          * Note this implies that \118 could be interpreted as
11898                          * 118 OR as "\11" . "8" depending on whether there
11899                          * were 118 capture buffers defined already in the
11900                          * pattern.  */
11901
11902                         /* NOTE, RExC_npar is 1 more than the actual number of
11903                          * parens we have seen so far, hence the < RExC_npar below. */
11904
11905                         if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
11906                         {  /* Not to be treated as an octal constant, go
11907                                    find backref */
11908                             --p;
11909                             goto loopdone;
11910                         }
11911                         /* FALLTHROUGH */
11912                     case '0':
11913                         {
11914                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11915                             STRLEN numlen = 3;
11916                             ender = grok_oct(p, &numlen, &flags, NULL);
11917                             if (ender > 0xff) {
11918                                 REQUIRE_UTF8;
11919                             }
11920                             p += numlen;
11921                             if (SIZE_ONLY   /* like \08, \178 */
11922                                 && numlen < 3
11923                                 && p < RExC_end
11924                                 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
11925                             {
11926                                 reg_warn_non_literal_string(
11927                                          p + 1,
11928                                          form_short_octal_warning(p, numlen));
11929                             }
11930                         }
11931                         if (PL_encoding && ender < 0x100)
11932                             goto recode_encoding;
11933                         break;
11934                     recode_encoding:
11935                         if (! RExC_override_recoding) {
11936                             SV* enc = PL_encoding;
11937                             ender = reg_recode((const char)(U8)ender, &enc);
11938                             if (!enc && SIZE_ONLY)
11939                                 ckWARNreg(p, "Invalid escape in the specified encoding");
11940                             REQUIRE_UTF8;
11941                         }
11942                         break;
11943                     case '\0':
11944                         if (p >= RExC_end)
11945                             FAIL("Trailing \\");
11946                         /* FALLTHROUGH */
11947                     default:
11948                         if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
11949                             /* Include any { following the alpha to emphasize
11950                              * that it could be part of an escape at some point
11951                              * in the future */
11952                             int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
11953                             ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
11954                         }
11955                         goto normal_default;
11956                     } /* End of switch on '\' */
11957                     break;
11958                 default:    /* A literal character */
11959
11960                   normal_default:
11961                     if (UTF8_IS_START(*p) && UTF) {
11962                         STRLEN numlen;
11963                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
11964                                                &numlen, UTF8_ALLOW_DEFAULT);
11965                         p += numlen;
11966                     }
11967                     else
11968                         ender = (U8) *p++;
11969                     break;
11970                 } /* End of switch on the literal */
11971
11972                 /* Here, have looked at the literal character and <ender>
11973                  * contains its ordinal, <p> points to the character after it
11974                  */
11975
11976                 if ( RExC_flags & RXf_PMf_EXTENDED)
11977                     p = regpatws(pRExC_state, p,
11978                                           TRUE); /* means recognize comments */
11979
11980                 /* If the next thing is a quantifier, it applies to this
11981                  * character only, which means that this character has to be in
11982                  * its own node and can't just be appended to the string in an
11983                  * existing node, so if there are already other characters in
11984                  * the node, close the node with just them, and set up to do
11985                  * this character again next time through, when it will be the
11986                  * only thing in its new node */
11987                 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
11988                 {
11989                     p = oldp;
11990                     goto loopdone;
11991                 }
11992
11993                 if (! FOLD   /* The simple case, just append the literal */
11994                     || (LOC  /* Also don't fold for tricky chars under /l */
11995                         && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)))
11996                 {
11997                     if (UTF) {
11998                         const STRLEN unilen = reguni(pRExC_state, ender, s);
11999                         if (unilen > 0) {
12000                            s   += unilen;
12001                            len += unilen;
12002                         }
12003
12004                         /* The loop increments <len> each time, as all but this
12005                          * path (and one other) through it add a single byte to
12006                          * the EXACTish node.  But this one has changed len to
12007                          * be the correct final value, so subtract one to
12008                          * cancel out the increment that follows */
12009                         len--;
12010                     }
12011                     else {
12012                         REGC((char)ender, s++);
12013                     }
12014
12015                     /* Can get here if folding only if is one of the /l
12016                      * characters whose fold depends on the locale.  The
12017                      * occurrence of any of these indicate that we can't
12018                      * simplify things */
12019                     if (FOLD) {
12020                         maybe_exact = FALSE;
12021                         maybe_exactfu = FALSE;
12022                     }
12023                 }
12024                 else             /* FOLD */
12025                      if (! ( UTF
12026                         /* See comments for join_exact() as to why we fold this
12027                          * non-UTF at compile time */
12028                         || (node_type == EXACTFU
12029                             && ender == LATIN_SMALL_LETTER_SHARP_S)))
12030                 {
12031                     /* Here, are folding and are not UTF-8 encoded; therefore
12032                      * the character must be in the range 0-255, and is not /l
12033                      * (Not /l because we already handled these under /l in
12034                      * is_PROBLEMATIC_LOCALE_FOLD_cp */
12035                     if (IS_IN_SOME_FOLD_L1(ender)) {
12036                         maybe_exact = FALSE;
12037
12038                         /* See if the character's fold differs between /d and
12039                          * /u.  This includes the multi-char fold SHARP S to
12040                          * 'ss' */
12041                         if (maybe_exactfu
12042                             && (PL_fold[ender] != PL_fold_latin1[ender]
12043                                 || ender == LATIN_SMALL_LETTER_SHARP_S
12044                                 || (len > 0
12045                                    && isARG2_lower_or_UPPER_ARG1('s', ender)
12046                                    && isARG2_lower_or_UPPER_ARG1('s',
12047                                                                  *(s-1)))))
12048                         {
12049                             maybe_exactfu = FALSE;
12050                         }
12051                     }
12052
12053                     /* Even when folding, we store just the input character, as
12054                      * we have an array that finds its fold quickly */
12055                     *(s++) = (char) ender;
12056                 }
12057                 else {  /* FOLD and UTF */
12058                     /* Unlike the non-fold case, we do actually have to
12059                      * calculate the results here in pass 1.  This is for two
12060                      * reasons, the folded length may be longer than the
12061                      * unfolded, and we have to calculate how many EXACTish
12062                      * nodes it will take; and we may run out of room in a node
12063                      * in the middle of a potential multi-char fold, and have
12064                      * to back off accordingly.  (Hence we can't use REGC for
12065                      * the simple case just below.) */
12066
12067                     UV folded;
12068                     if (isASCII(ender)) {
12069                         folded = toFOLD(ender);
12070                         *(s)++ = (U8) folded;
12071                     }
12072                     else {
12073                         STRLEN foldlen;
12074
12075                         folded = _to_uni_fold_flags(
12076                                      ender,
12077                                      (U8 *) s,
12078                                      &foldlen,
12079                                      FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12080                                                         ? FOLD_FLAGS_NOMIX_ASCII
12081                                                         : 0));
12082                         s += foldlen;
12083
12084                         /* The loop increments <len> each time, as all but this
12085                          * path (and one other) through it add a single byte to
12086                          * the EXACTish node.  But this one has changed len to
12087                          * be the correct final value, so subtract one to
12088                          * cancel out the increment that follows */
12089                         len += foldlen - 1;
12090                     }
12091                     /* If this node only contains non-folding code points so
12092                      * far, see if this new one is also non-folding */
12093                     if (maybe_exact) {
12094                         if (folded != ender) {
12095                             maybe_exact = FALSE;
12096                         }
12097                         else {
12098                             /* Here the fold is the original; we have to check
12099                              * further to see if anything folds to it */
12100                             if (_invlist_contains_cp(PL_utf8_foldable,
12101                                                         ender))
12102                             {
12103                                 maybe_exact = FALSE;
12104                             }
12105                         }
12106                     }
12107                     ender = folded;
12108                 }
12109
12110                 if (next_is_quantifier) {
12111
12112                     /* Here, the next input is a quantifier, and to get here,
12113                      * the current character is the only one in the node.
12114                      * Also, here <len> doesn't include the final byte for this
12115                      * character */
12116                     len++;
12117                     goto loopdone;
12118                 }
12119
12120             } /* End of loop through literal characters */
12121
12122             /* Here we have either exhausted the input or ran out of room in
12123              * the node.  (If we encountered a character that can't be in the
12124              * node, transfer is made directly to <loopdone>, and so we
12125              * wouldn't have fallen off the end of the loop.)  In the latter
12126              * case, we artificially have to split the node into two, because
12127              * we just don't have enough space to hold everything.  This
12128              * creates a problem if the final character participates in a
12129              * multi-character fold in the non-final position, as a match that
12130              * should have occurred won't, due to the way nodes are matched,
12131              * and our artificial boundary.  So back off until we find a non-
12132              * problematic character -- one that isn't at the beginning or
12133              * middle of such a fold.  (Either it doesn't participate in any
12134              * folds, or appears only in the final position of all the folds it
12135              * does participate in.)  A better solution with far fewer false
12136              * positives, and that would fill the nodes more completely, would
12137              * be to actually have available all the multi-character folds to
12138              * test against, and to back-off only far enough to be sure that
12139              * this node isn't ending with a partial one.  <upper_parse> is set
12140              * further below (if we need to reparse the node) to include just
12141              * up through that final non-problematic character that this code
12142              * identifies, so when it is set to less than the full node, we can
12143              * skip the rest of this */
12144             if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
12145
12146                 const STRLEN full_len = len;
12147
12148                 assert(len >= MAX_NODE_STRING_SIZE);
12149
12150                 /* Here, <s> points to the final byte of the final character.
12151                  * Look backwards through the string until find a non-
12152                  * problematic character */
12153
12154                 if (! UTF) {
12155
12156                     /* This has no multi-char folds to non-UTF characters */
12157                     if (ASCII_FOLD_RESTRICTED) {
12158                         goto loopdone;
12159                     }
12160
12161                     while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
12162                     len = s - s0 + 1;
12163                 }
12164                 else {
12165                     if (!  PL_NonL1NonFinalFold) {
12166                         PL_NonL1NonFinalFold = _new_invlist_C_array(
12167                                         NonL1_Perl_Non_Final_Folds_invlist);
12168                     }
12169
12170                     /* Point to the first byte of the final character */
12171                     s = (char *) utf8_hop((U8 *) s, -1);
12172
12173                     while (s >= s0) {   /* Search backwards until find
12174                                            non-problematic char */
12175                         if (UTF8_IS_INVARIANT(*s)) {
12176
12177                             /* There are no ascii characters that participate
12178                              * in multi-char folds under /aa.  In EBCDIC, the
12179                              * non-ascii invariants are all control characters,
12180                              * so don't ever participate in any folds. */
12181                             if (ASCII_FOLD_RESTRICTED
12182                                 || ! IS_NON_FINAL_FOLD(*s))
12183                             {
12184                                 break;
12185                             }
12186                         }
12187                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
12188                             if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
12189                                                                   *s, *(s+1))))
12190                             {
12191                                 break;
12192                             }
12193                         }
12194                         else if (! _invlist_contains_cp(
12195                                         PL_NonL1NonFinalFold,
12196                                         valid_utf8_to_uvchr((U8 *) s, NULL)))
12197                         {
12198                             break;
12199                         }
12200
12201                         /* Here, the current character is problematic in that
12202                          * it does occur in the non-final position of some
12203                          * fold, so try the character before it, but have to
12204                          * special case the very first byte in the string, so
12205                          * we don't read outside the string */
12206                         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
12207                     } /* End of loop backwards through the string */
12208
12209                     /* If there were only problematic characters in the string,
12210                      * <s> will point to before s0, in which case the length
12211                      * should be 0, otherwise include the length of the
12212                      * non-problematic character just found */
12213                     len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
12214                 }
12215
12216                 /* Here, have found the final character, if any, that is
12217                  * non-problematic as far as ending the node without splitting
12218                  * it across a potential multi-char fold.  <len> contains the
12219                  * number of bytes in the node up-to and including that
12220                  * character, or is 0 if there is no such character, meaning
12221                  * the whole node contains only problematic characters.  In
12222                  * this case, give up and just take the node as-is.  We can't
12223                  * do any better */
12224                 if (len == 0) {
12225                     len = full_len;
12226
12227                     /* If the node ends in an 's' we make sure it stays EXACTF,
12228                      * as if it turns into an EXACTFU, it could later get
12229                      * joined with another 's' that would then wrongly match
12230                      * the sharp s */
12231                     if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
12232                     {
12233                         maybe_exactfu = FALSE;
12234                     }
12235                 } else {
12236
12237                     /* Here, the node does contain some characters that aren't
12238                      * problematic.  If one such is the final character in the
12239                      * node, we are done */
12240                     if (len == full_len) {
12241                         goto loopdone;
12242                     }
12243                     else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
12244
12245                         /* If the final character is problematic, but the
12246                          * penultimate is not, back-off that last character to
12247                          * later start a new node with it */
12248                         p = oldp;
12249                         goto loopdone;
12250                     }
12251
12252                     /* Here, the final non-problematic character is earlier
12253                      * in the input than the penultimate character.  What we do
12254                      * is reparse from the beginning, going up only as far as
12255                      * this final ok one, thus guaranteeing that the node ends
12256                      * in an acceptable character.  The reason we reparse is
12257                      * that we know how far in the character is, but we don't
12258                      * know how to correlate its position with the input parse.
12259                      * An alternate implementation would be to build that
12260                      * correlation as we go along during the original parse,
12261                      * but that would entail extra work for every node, whereas
12262                      * this code gets executed only when the string is too
12263                      * large for the node, and the final two characters are
12264                      * problematic, an infrequent occurrence.  Yet another
12265                      * possible strategy would be to save the tail of the
12266                      * string, and the next time regatom is called, initialize
12267                      * with that.  The problem with this is that unless you
12268                      * back off one more character, you won't be guaranteed
12269                      * regatom will get called again, unless regbranch,
12270                      * regpiece ... are also changed.  If you do back off that
12271                      * extra character, so that there is input guaranteed to
12272                      * force calling regatom, you can't handle the case where
12273                      * just the first character in the node is acceptable.  I
12274                      * (khw) decided to try this method which doesn't have that
12275                      * pitfall; if performance issues are found, we can do a
12276                      * combination of the current approach plus that one */
12277                     upper_parse = len;
12278                     len = 0;
12279                     s = s0;
12280                     goto reparse;
12281                 }
12282             }   /* End of verifying node ends with an appropriate char */
12283
12284         loopdone:   /* Jumped to when encounters something that shouldn't be in
12285                        the node */
12286
12287             /* I (khw) don't know if you can get here with zero length, but the
12288              * old code handled this situation by creating a zero-length EXACT
12289              * node.  Might as well be NOTHING instead */
12290             if (len == 0) {
12291                 OP(ret) = NOTHING;
12292             }
12293             else {
12294                 if (FOLD) {
12295                     /* If 'maybe_exact' is still set here, means there are no
12296                      * code points in the node that participate in folds;
12297                      * similarly for 'maybe_exactfu' and code points that match
12298                      * differently depending on UTF8ness of the target string
12299                      * (for /u), or depending on locale for /l */
12300                     if (maybe_exact) {
12301                         OP(ret) = EXACT;
12302                     }
12303                     else if (maybe_exactfu) {
12304                         OP(ret) = EXACTFU;
12305                     }
12306                 }
12307                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
12308                                            FALSE /* Don't look to see if could
12309                                                     be turned into an EXACT
12310                                                     node, as we have already
12311                                                     computed that */
12312                                           );
12313             }
12314
12315             RExC_parse = p - 1;
12316             Set_Node_Cur_Length(ret, parse_start);
12317             nextchar(pRExC_state);
12318             {
12319                 /* len is STRLEN which is unsigned, need to copy to signed */
12320                 IV iv = len;
12321                 if (iv < 0)
12322                     vFAIL("Internal disaster");
12323             }
12324
12325         } /* End of label 'defchar:' */
12326         break;
12327     } /* End of giant switch on input character */
12328
12329     return(ret);
12330 }
12331
12332 STATIC char *
12333 S_regpatws(pTHX_ RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
12334 {
12335     /* Returns the next non-pattern-white space, non-comment character (the
12336      * latter only if 'recognize_comment is true) in the string p, which is
12337      * ended by RExC_end.  See also reg_skipcomment */
12338     const char *e = RExC_end;
12339
12340     PERL_ARGS_ASSERT_REGPATWS;
12341
12342     while (p < e) {
12343         STRLEN len;
12344         if ((len = is_PATWS_safe(p, e, UTF))) {
12345             p += len;
12346         }
12347         else if (recognize_comment && *p == '#') {
12348             p = reg_skipcomment(pRExC_state, p);
12349         }
12350         else
12351             break;
12352     }
12353     return p;
12354 }
12355
12356 STATIC void
12357 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
12358 {
12359     /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
12360      * sets up the bitmap and any flags, removing those code points from the
12361      * inversion list, setting it to NULL should it become completely empty */
12362
12363     PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
12364     assert(PL_regkind[OP(node)] == ANYOF);
12365
12366     ANYOF_BITMAP_ZERO(node);
12367     if (*invlist_ptr) {
12368
12369         /* This gets set if we actually need to modify things */
12370         bool change_invlist = FALSE;
12371
12372         UV start, end;
12373
12374         /* Start looking through *invlist_ptr */
12375         invlist_iterinit(*invlist_ptr);
12376         while (invlist_iternext(*invlist_ptr, &start, &end)) {
12377             UV high;
12378             int i;
12379
12380             if (end == UV_MAX && start <= 256) {
12381                 ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL;
12382             }
12383             else if (end >= 256) {
12384                 ANYOF_FLAGS(node) |= ANYOF_UTF8;
12385             }
12386
12387             /* Quit if are above what we should change */
12388             if (start > 255) {
12389                 break;
12390             }
12391
12392             change_invlist = TRUE;
12393
12394             /* Set all the bits in the range, up to the max that we are doing */
12395             high = (end < 255) ? end : 255;
12396             for (i = start; i <= (int) high; i++) {
12397                 if (! ANYOF_BITMAP_TEST(node, i)) {
12398                     ANYOF_BITMAP_SET(node, i);
12399                 }
12400             }
12401         }
12402         invlist_iterfinish(*invlist_ptr);
12403
12404         /* Done with loop; remove any code points that are in the bitmap from
12405          * *invlist_ptr; similarly for code points above latin1 if we have a
12406          * flag to match all of them anyways */
12407         if (change_invlist) {
12408             _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr);
12409         }
12410         if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
12411             _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr);
12412         }
12413
12414         /* If have completely emptied it, remove it completely */
12415         if (_invlist_len(*invlist_ptr) == 0) {
12416             SvREFCNT_dec_NN(*invlist_ptr);
12417             *invlist_ptr = NULL;
12418         }
12419     }
12420 }
12421
12422 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
12423    Character classes ([:foo:]) can also be negated ([:^foo:]).
12424    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
12425    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
12426    but trigger failures because they are currently unimplemented. */
12427
12428 #define POSIXCC_DONE(c)   ((c) == ':')
12429 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
12430 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
12431
12432 PERL_STATIC_INLINE I32
12433 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
12434 {
12435     dVAR;
12436     I32 namedclass = OOB_NAMEDCLASS;
12437
12438     PERL_ARGS_ASSERT_REGPPOSIXCC;
12439
12440     if (value == '[' && RExC_parse + 1 < RExC_end &&
12441         /* I smell either [: or [= or [. -- POSIX has been here, right? */
12442         POSIXCC(UCHARAT(RExC_parse)))
12443     {
12444         const char c = UCHARAT(RExC_parse);
12445         char* const s = RExC_parse++;
12446
12447         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
12448             RExC_parse++;
12449         if (RExC_parse == RExC_end) {
12450             if (strict) {
12451
12452                 /* Try to give a better location for the error (than the end of
12453                  * the string) by looking for the matching ']' */
12454                 RExC_parse = s;
12455                 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
12456                     RExC_parse++;
12457                 }
12458                 vFAIL2("Unmatched '%c' in POSIX class", c);
12459             }
12460             /* Grandfather lone [:, [=, [. */
12461             RExC_parse = s;
12462         }
12463         else {
12464             const char* const t = RExC_parse++; /* skip over the c */
12465             assert(*t == c);
12466
12467             if (UCHARAT(RExC_parse) == ']') {
12468                 const char *posixcc = s + 1;
12469                 RExC_parse++; /* skip over the ending ] */
12470
12471                 if (*s == ':') {
12472                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
12473                     const I32 skip = t - posixcc;
12474
12475                     /* Initially switch on the length of the name.  */
12476                     switch (skip) {
12477                     case 4:
12478                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
12479                                                           this is the Perl \w
12480                                                         */
12481                             namedclass = ANYOF_WORDCHAR;
12482                         break;
12483                     case 5:
12484                         /* Names all of length 5.  */
12485                         /* alnum alpha ascii blank cntrl digit graph lower
12486                            print punct space upper  */
12487                         /* Offset 4 gives the best switch position.  */
12488                         switch (posixcc[4]) {
12489                         case 'a':
12490                             if (memEQ(posixcc, "alph", 4)) /* alpha */
12491                                 namedclass = ANYOF_ALPHA;
12492                             break;
12493                         case 'e':
12494                             if (memEQ(posixcc, "spac", 4)) /* space */
12495                                 namedclass = ANYOF_PSXSPC;
12496                             break;
12497                         case 'h':
12498                             if (memEQ(posixcc, "grap", 4)) /* graph */
12499                                 namedclass = ANYOF_GRAPH;
12500                             break;
12501                         case 'i':
12502                             if (memEQ(posixcc, "asci", 4)) /* ascii */
12503                                 namedclass = ANYOF_ASCII;
12504                             break;
12505                         case 'k':
12506                             if (memEQ(posixcc, "blan", 4)) /* blank */
12507                                 namedclass = ANYOF_BLANK;
12508                             break;
12509                         case 'l':
12510                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
12511                                 namedclass = ANYOF_CNTRL;
12512                             break;
12513                         case 'm':
12514                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
12515                                 namedclass = ANYOF_ALPHANUMERIC;
12516                             break;
12517                         case 'r':
12518                             if (memEQ(posixcc, "lowe", 4)) /* lower */
12519                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
12520                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
12521                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
12522                             break;
12523                         case 't':
12524                             if (memEQ(posixcc, "digi", 4)) /* digit */
12525                                 namedclass = ANYOF_DIGIT;
12526                             else if (memEQ(posixcc, "prin", 4)) /* print */
12527                                 namedclass = ANYOF_PRINT;
12528                             else if (memEQ(posixcc, "punc", 4)) /* punct */
12529                                 namedclass = ANYOF_PUNCT;
12530                             break;
12531                         }
12532                         break;
12533                     case 6:
12534                         if (memEQ(posixcc, "xdigit", 6))
12535                             namedclass = ANYOF_XDIGIT;
12536                         break;
12537                     }
12538
12539                     if (namedclass == OOB_NAMEDCLASS)
12540                         vFAIL2utf8f(
12541                             "POSIX class [:%"UTF8f":] unknown",
12542                             UTF8fARG(UTF, t - s - 1, s + 1));
12543
12544                     /* The #defines are structured so each complement is +1 to
12545                      * the normal one */
12546                     if (complement) {
12547                         namedclass++;
12548                     }
12549                     assert (posixcc[skip] == ':');
12550                     assert (posixcc[skip+1] == ']');
12551                 } else if (!SIZE_ONLY) {
12552                     /* [[=foo=]] and [[.foo.]] are still future. */
12553
12554                     /* adjust RExC_parse so the warning shows after
12555                        the class closes */
12556                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
12557                         RExC_parse++;
12558                     vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
12559                 }
12560             } else {
12561                 /* Maternal grandfather:
12562                  * "[:" ending in ":" but not in ":]" */
12563                 if (strict) {
12564                     vFAIL("Unmatched '[' in POSIX class");
12565                 }
12566
12567                 /* Grandfather lone [:, [=, [. */
12568                 RExC_parse = s;
12569             }
12570         }
12571     }
12572
12573     return namedclass;
12574 }
12575
12576 STATIC bool
12577 S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state)
12578 {
12579     /* This applies some heuristics at the current parse position (which should
12580      * be at a '[') to see if what follows might be intended to be a [:posix:]
12581      * class.  It returns true if it really is a posix class, of course, but it
12582      * also can return true if it thinks that what was intended was a posix
12583      * class that didn't quite make it.
12584      *
12585      * It will return true for
12586      *      [:alphanumerics:
12587      *      [:alphanumerics]  (as long as the ] isn't followed immediately by a
12588      *                         ')' indicating the end of the (?[
12589      *      [:any garbage including %^&$ punctuation:]
12590      *
12591      * This is designed to be called only from S_handle_regex_sets; it could be
12592      * easily adapted to be called from the spot at the beginning of regclass()
12593      * that checks to see in a normal bracketed class if the surrounding []
12594      * have been omitted ([:word:] instead of [[:word:]]).  But doing so would
12595      * change long-standing behavior, so I (khw) didn't do that */
12596     char* p = RExC_parse + 1;
12597     char first_char = *p;
12598
12599     PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
12600
12601     assert(*(p - 1) == '[');
12602
12603     if (! POSIXCC(first_char)) {
12604         return FALSE;
12605     }
12606
12607     p++;
12608     while (p < RExC_end && isWORDCHAR(*p)) p++;
12609
12610     if (p >= RExC_end) {
12611         return FALSE;
12612     }
12613
12614     if (p - RExC_parse > 2    /* Got at least 1 word character */
12615         && (*p == first_char
12616             || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
12617     {
12618         return TRUE;
12619     }
12620
12621     p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
12622
12623     return (p
12624             && p - RExC_parse > 2 /* [:] evaluates to colon;
12625                                       [::] is a bad posix class. */
12626             && first_char == *(p - 1));
12627 }
12628
12629 STATIC regnode *
12630 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
12631                     I32 *flagp, U32 depth,
12632                     char * const oregcomp_parse)
12633 {
12634     /* Handle the (?[...]) construct to do set operations */
12635
12636     U8 curchar;
12637     UV start, end;      /* End points of code point ranges */
12638     SV* result_string;
12639     char *save_end, *save_parse;
12640     SV* final;
12641     STRLEN len;
12642     regnode* node;
12643     AV* stack;
12644     const bool save_fold = FOLD;
12645
12646     GET_RE_DEBUG_FLAGS_DECL;
12647
12648     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
12649
12650     if (LOC) {
12651         vFAIL("(?[...]) not valid in locale");
12652     }
12653     RExC_uni_semantics = 1;
12654
12655     /* This will return only an ANYOF regnode, or (unlikely) something smaller
12656      * (such as EXACT).  Thus we can skip most everything if just sizing.  We
12657      * call regclass to handle '[]' so as to not have to reinvent its parsing
12658      * rules here (throwing away the size it computes each time).  And, we exit
12659      * upon an unescaped ']' that isn't one ending a regclass.  To do both
12660      * these things, we need to realize that something preceded by a backslash
12661      * is escaped, so we have to keep track of backslashes */
12662     if (SIZE_ONLY) {
12663         UV depth = 0; /* how many nested (?[...]) constructs */
12664
12665         Perl_ck_warner_d(aTHX_
12666             packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
12667             "The regex_sets feature is experimental" REPORT_LOCATION,
12668                 UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
12669                 UTF8fARG(UTF,
12670                          RExC_end - RExC_start - (RExC_parse - RExC_precomp),
12671                          RExC_precomp + (RExC_parse - RExC_precomp)));
12672
12673         while (RExC_parse < RExC_end) {
12674             SV* current = NULL;
12675             RExC_parse = regpatws(pRExC_state, RExC_parse,
12676                                           TRUE); /* means recognize comments */
12677             switch (*RExC_parse) {
12678                 case '?':
12679                     if (RExC_parse[1] == '[') depth++, RExC_parse++;
12680                     /* FALLTHROUGH */
12681                 default:
12682                     break;
12683                 case '\\':
12684                     /* Skip the next byte (which could cause us to end up in
12685                      * the middle of a UTF-8 character, but since none of those
12686                      * are confusable with anything we currently handle in this
12687                      * switch (invariants all), it's safe.  We'll just hit the
12688                      * default: case next time and keep on incrementing until
12689                      * we find one of the invariants we do handle. */
12690                     RExC_parse++;
12691                     break;
12692                 case '[':
12693                 {
12694                     /* If this looks like it is a [:posix:] class, leave the
12695                      * parse pointer at the '[' to fool regclass() into
12696                      * thinking it is part of a '[[:posix:]]'.  That function
12697                      * will use strict checking to force a syntax error if it
12698                      * doesn't work out to a legitimate class */
12699                     bool is_posix_class
12700                                     = could_it_be_a_POSIX_class(pRExC_state);
12701                     if (! is_posix_class) {
12702                         RExC_parse++;
12703                     }
12704
12705                     /* regclass() can only return RESTART_UTF8 if multi-char
12706                        folds are allowed.  */
12707                     if (!regclass(pRExC_state, flagp,depth+1,
12708                                   is_posix_class, /* parse the whole char
12709                                                      class only if not a
12710                                                      posix class */
12711                                   FALSE, /* don't allow multi-char folds */
12712                                   TRUE, /* silence non-portable warnings. */
12713                                   &current))
12714                         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12715                               (UV) *flagp);
12716
12717                     /* function call leaves parse pointing to the ']', except
12718                      * if we faked it */
12719                     if (is_posix_class) {
12720                         RExC_parse--;
12721                     }
12722
12723                     SvREFCNT_dec(current);   /* In case it returned something */
12724                     break;
12725                 }
12726
12727                 case ']':
12728                     if (depth--) break;
12729                     RExC_parse++;
12730                     if (RExC_parse < RExC_end
12731                         && *RExC_parse == ')')
12732                     {
12733                         node = reganode(pRExC_state, ANYOF, 0);
12734                         RExC_size += ANYOF_SKIP;
12735                         nextchar(pRExC_state);
12736                         Set_Node_Length(node,
12737                                 RExC_parse - oregcomp_parse + 1); /* MJD */
12738                         return node;
12739                     }
12740                     goto no_close;
12741             }
12742             RExC_parse++;
12743         }
12744
12745         no_close:
12746         FAIL("Syntax error in (?[...])");
12747     }
12748
12749     /* Pass 2 only after this.  Everything in this construct is a
12750      * metacharacter.  Operands begin with either a '\' (for an escape
12751      * sequence), or a '[' for a bracketed character class.  Any other
12752      * character should be an operator, or parenthesis for grouping.  Both
12753      * types of operands are handled by calling regclass() to parse them.  It
12754      * is called with a parameter to indicate to return the computed inversion
12755      * list.  The parsing here is implemented via a stack.  Each entry on the
12756      * stack is a single character representing one of the operators, or the
12757      * '('; or else a pointer to an operand inversion list. */
12758
12759 #define IS_OPERAND(a)  (! SvIOK(a))
12760
12761     /* The stack starts empty.  It is a syntax error if the first thing parsed
12762      * is a binary operator; everything else is pushed on the stack.  When an
12763      * operand is parsed, the top of the stack is examined.  If it is a binary
12764      * operator, the item before it should be an operand, and both are replaced
12765      * by the result of doing that operation on the new operand and the one on
12766      * the stack.   Thus a sequence of binary operands is reduced to a single
12767      * one before the next one is parsed.
12768      *
12769      * A unary operator may immediately follow a binary in the input, for
12770      * example
12771      *      [a] + ! [b]
12772      * When an operand is parsed and the top of the stack is a unary operator,
12773      * the operation is performed, and then the stack is rechecked to see if
12774      * this new operand is part of a binary operation; if so, it is handled as
12775      * above.
12776      *
12777      * A '(' is simply pushed on the stack; it is valid only if the stack is
12778      * empty, or the top element of the stack is an operator or another '('
12779      * (for which the parenthesized expression will become an operand).  By the
12780      * time the corresponding ')' is parsed everything in between should have
12781      * been parsed and evaluated to a single operand (or else is a syntax
12782      * error), and is handled as a regular operand */
12783
12784     sv_2mortal((SV *)(stack = newAV()));
12785
12786     while (RExC_parse < RExC_end) {
12787         I32 top_index = av_tindex(stack);
12788         SV** top_ptr;
12789         SV* current = NULL;
12790
12791         /* Skip white space */
12792         RExC_parse = regpatws(pRExC_state, RExC_parse,
12793                                          TRUE /* means recognize comments */ );
12794         if (RExC_parse >= RExC_end) {
12795             Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
12796         }
12797         if ((curchar = UCHARAT(RExC_parse)) == ']') {
12798             break;
12799         }
12800
12801         switch (curchar) {
12802
12803             case '?':
12804                 if (av_tindex(stack) >= 0   /* This makes sure that we can
12805                                                safely subtract 1 from
12806                                                RExC_parse in the next clause.
12807                                                If we have something on the
12808                                                stack, we have parsed something
12809                                              */
12810                     && UCHARAT(RExC_parse - 1) == '('
12811                     && RExC_parse < RExC_end)
12812                 {
12813                     /* If is a '(?', could be an embedded '(?flags:(?[...])'.
12814                      * This happens when we have some thing like
12815                      *
12816                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
12817                      *   ...
12818                      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
12819                      *
12820                      * Here we would be handling the interpolated
12821                      * '$thai_or_lao'.  We handle this by a recursive call to
12822                      * ourselves which returns the inversion list the
12823                      * interpolated expression evaluates to.  We use the flags
12824                      * from the interpolated pattern. */
12825                     U32 save_flags = RExC_flags;
12826                     const char * const save_parse = ++RExC_parse;
12827
12828                     parse_lparen_question_flags(pRExC_state);
12829
12830                     if (RExC_parse == save_parse  /* Makes sure there was at
12831                                                      least one flag (or this
12832                                                      embedding wasn't compiled)
12833                                                    */
12834                         || RExC_parse >= RExC_end - 4
12835                         || UCHARAT(RExC_parse) != ':'
12836                         || UCHARAT(++RExC_parse) != '('
12837                         || UCHARAT(++RExC_parse) != '?'
12838                         || UCHARAT(++RExC_parse) != '[')
12839                     {
12840
12841                         /* In combination with the above, this moves the
12842                          * pointer to the point just after the first erroneous
12843                          * character (or if there are no flags, to where they
12844                          * should have been) */
12845                         if (RExC_parse >= RExC_end - 4) {
12846                             RExC_parse = RExC_end;
12847                         }
12848                         else if (RExC_parse != save_parse) {
12849                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12850                         }
12851                         vFAIL("Expecting '(?flags:(?[...'");
12852                     }
12853                     RExC_parse++;
12854                     (void) handle_regex_sets(pRExC_state, &current, flagp,
12855                                                     depth+1, oregcomp_parse);
12856
12857                     /* Here, 'current' contains the embedded expression's
12858                      * inversion list, and RExC_parse points to the trailing
12859                      * ']'; the next character should be the ')' which will be
12860                      * paired with the '(' that has been put on the stack, so
12861                      * the whole embedded expression reduces to '(operand)' */
12862                     RExC_parse++;
12863
12864                     RExC_flags = save_flags;
12865                     goto handle_operand;
12866                 }
12867                 /* FALLTHROUGH */
12868
12869             default:
12870                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12871                 vFAIL("Unexpected character");
12872
12873             case '\\':
12874                 /* regclass() can only return RESTART_UTF8 if multi-char
12875                    folds are allowed.  */
12876                 if (!regclass(pRExC_state, flagp,depth+1,
12877                               TRUE, /* means parse just the next thing */
12878                               FALSE, /* don't allow multi-char folds */
12879                               FALSE, /* don't silence non-portable warnings.  */
12880                               &current))
12881                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12882                           (UV) *flagp);
12883                 /* regclass() will return with parsing just the \ sequence,
12884                  * leaving the parse pointer at the next thing to parse */
12885                 RExC_parse--;
12886                 goto handle_operand;
12887
12888             case '[':   /* Is a bracketed character class */
12889             {
12890                 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
12891
12892                 if (! is_posix_class) {
12893                     RExC_parse++;
12894                 }
12895
12896                 /* regclass() can only return RESTART_UTF8 if multi-char
12897                    folds are allowed.  */
12898                 if(!regclass(pRExC_state, flagp,depth+1,
12899                              is_posix_class, /* parse the whole char class
12900                                                 only if not a posix class */
12901                              FALSE, /* don't allow multi-char folds */
12902                              FALSE, /* don't silence non-portable warnings.  */
12903                              &current))
12904                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12905                           (UV) *flagp);
12906                 /* function call leaves parse pointing to the ']', except if we
12907                  * faked it */
12908                 if (is_posix_class) {
12909                     RExC_parse--;
12910                 }
12911
12912                 goto handle_operand;
12913             }
12914
12915             case '&':
12916             case '|':
12917             case '+':
12918             case '-':
12919             case '^':
12920                 if (top_index < 0
12921                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
12922                     || ! IS_OPERAND(*top_ptr))
12923                 {
12924                     RExC_parse++;
12925                     vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
12926                 }
12927                 av_push(stack, newSVuv(curchar));
12928                 break;
12929
12930             case '!':
12931                 av_push(stack, newSVuv(curchar));
12932                 break;
12933
12934             case '(':
12935                 if (top_index >= 0) {
12936                     top_ptr = av_fetch(stack, top_index, FALSE);
12937                     assert(top_ptr);
12938                     if (IS_OPERAND(*top_ptr)) {
12939                         RExC_parse++;
12940                         vFAIL("Unexpected '(' with no preceding operator");
12941                     }
12942                 }
12943                 av_push(stack, newSVuv(curchar));
12944                 break;
12945
12946             case ')':
12947             {
12948                 SV* lparen;
12949                 if (top_index < 1
12950                     || ! (current = av_pop(stack))
12951                     || ! IS_OPERAND(current)
12952                     || ! (lparen = av_pop(stack))
12953                     || IS_OPERAND(lparen)
12954                     || SvUV(lparen) != '(')
12955                 {
12956                     SvREFCNT_dec(current);
12957                     RExC_parse++;
12958                     vFAIL("Unexpected ')'");
12959                 }
12960                 top_index -= 2;
12961                 SvREFCNT_dec_NN(lparen);
12962
12963                 /* FALLTHROUGH */
12964             }
12965
12966               handle_operand:
12967
12968                 /* Here, we have an operand to process, in 'current' */
12969
12970                 if (top_index < 0) {    /* Just push if stack is empty */
12971                     av_push(stack, current);
12972                 }
12973                 else {
12974                     SV* top = av_pop(stack);
12975                     SV *prev = NULL;
12976                     char current_operator;
12977
12978                     if (IS_OPERAND(top)) {
12979                         SvREFCNT_dec_NN(top);
12980                         SvREFCNT_dec_NN(current);
12981                         vFAIL("Operand with no preceding operator");
12982                     }
12983                     current_operator = (char) SvUV(top);
12984                     switch (current_operator) {
12985                         case '(':   /* Push the '(' back on followed by the new
12986                                        operand */
12987                             av_push(stack, top);
12988                             av_push(stack, current);
12989                             SvREFCNT_inc(top);  /* Counters the '_dec' done
12990                                                    just after the 'break', so
12991                                                    it doesn't get wrongly freed
12992                                                  */
12993                             break;
12994
12995                         case '!':
12996                             _invlist_invert(current);
12997
12998                             /* Unlike binary operators, the top of the stack,
12999                              * now that this unary one has been popped off, may
13000                              * legally be an operator, and we now have operand
13001                              * for it. */
13002                             top_index--;
13003                             SvREFCNT_dec_NN(top);
13004                             goto handle_operand;
13005
13006                         case '&':
13007                             prev = av_pop(stack);
13008                             _invlist_intersection(prev,
13009                                                    current,
13010                                                    &current);
13011                             av_push(stack, current);
13012                             break;
13013
13014                         case '|':
13015                         case '+':
13016                             prev = av_pop(stack);
13017                             _invlist_union(prev, current, &current);
13018                             av_push(stack, current);
13019                             break;
13020
13021                         case '-':
13022                             prev = av_pop(stack);;
13023                             _invlist_subtract(prev, current, &current);
13024                             av_push(stack, current);
13025                             break;
13026
13027                         case '^':   /* The union minus the intersection */
13028                         {
13029                             SV* i = NULL;
13030                             SV* u = NULL;
13031                             SV* element;
13032
13033                             prev = av_pop(stack);
13034                             _invlist_union(prev, current, &u);
13035                             _invlist_intersection(prev, current, &i);
13036                             /* _invlist_subtract will overwrite current
13037                                 without freeing what it already contains */
13038                             element = current;
13039                             _invlist_subtract(u, i, &current);
13040                             av_push(stack, current);
13041                             SvREFCNT_dec_NN(i);
13042                             SvREFCNT_dec_NN(u);
13043                             SvREFCNT_dec_NN(element);
13044                             break;
13045                         }
13046
13047                         default:
13048                             Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
13049                 }
13050                 SvREFCNT_dec_NN(top);
13051                 SvREFCNT_dec(prev);
13052             }
13053         }
13054
13055         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13056     }
13057
13058     if (av_tindex(stack) < 0   /* Was empty */
13059         || ((final = av_pop(stack)) == NULL)
13060         || ! IS_OPERAND(final)
13061         || av_tindex(stack) >= 0)  /* More left on stack */
13062     {
13063         vFAIL("Incomplete expression within '(?[ ])'");
13064     }
13065
13066     /* Here, 'final' is the resultant inversion list from evaluating the
13067      * expression.  Return it if so requested */
13068     if (return_invlist) {
13069         *return_invlist = final;
13070         return END;
13071     }
13072
13073     /* Otherwise generate a resultant node, based on 'final'.  regclass() is
13074      * expecting a string of ranges and individual code points */
13075     invlist_iterinit(final);
13076     result_string = newSVpvs("");
13077     while (invlist_iternext(final, &start, &end)) {
13078         if (start == end) {
13079             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
13080         }
13081         else {
13082             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
13083                                                      start,          end);
13084         }
13085     }
13086
13087     save_parse = RExC_parse;
13088     RExC_parse = SvPV(result_string, len);
13089     save_end = RExC_end;
13090     RExC_end = RExC_parse + len;
13091
13092     /* We turn off folding around the call, as the class we have constructed
13093      * already has all folding taken into consideration, and we don't want
13094      * regclass() to add to that */
13095     RExC_flags &= ~RXf_PMf_FOLD;
13096     /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
13097      */
13098     node = regclass(pRExC_state, flagp,depth+1,
13099                     FALSE, /* means parse the whole char class */
13100                     FALSE, /* don't allow multi-char folds */
13101                     TRUE, /* silence non-portable warnings.  The above may very
13102                              well have generated non-portable code points, but
13103                              they're valid on this machine */
13104                     NULL);
13105     if (!node)
13106         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
13107                     PTR2UV(flagp));
13108     if (save_fold) {
13109         RExC_flags |= RXf_PMf_FOLD;
13110     }
13111     RExC_parse = save_parse + 1;
13112     RExC_end = save_end;
13113     SvREFCNT_dec_NN(final);
13114     SvREFCNT_dec_NN(result_string);
13115
13116     nextchar(pRExC_state);
13117     Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
13118     return node;
13119 }
13120 #undef IS_OPERAND
13121
13122 STATIC void
13123 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
13124 {
13125     /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
13126      * innocent-looking character class, like /[ks]/i won't have to go out to
13127      * disk to find the possible matches.
13128      *
13129      * This should be called only for a Latin1-range code points, cp, which is
13130      * known to be involved in a fold with other code points above Latin1.  It
13131      * would give false results if /aa has been specified.  Multi-char folds
13132      * are outside the scope of this, and must be handled specially.
13133      *
13134      * XXX It would be better to generate these via regen, in case a new
13135      * version of the Unicode standard adds new mappings, though that is not
13136      * really likely, and may be caught by the default: case of the switch
13137      * below. */
13138
13139     PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
13140
13141     switch (cp) {
13142         case 'k':
13143         case 'K':
13144           *invlist =
13145              add_cp_to_invlist(*invlist, KELVIN_SIGN);
13146             break;
13147         case 's':
13148         case 'S':
13149           *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
13150             break;
13151         case MICRO_SIGN:
13152           *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
13153           *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
13154             break;
13155         case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13156         case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13157           *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
13158             break;
13159         case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13160           *invlist = add_cp_to_invlist(*invlist,
13161                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13162             break;
13163         case LATIN_SMALL_LETTER_SHARP_S:
13164           *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
13165             break;
13166         case 'F': case 'f':
13167         case 'I': case 'i':
13168         case 'L': case 'l':
13169         case 'T': case 't':
13170         case 'A': case 'a':
13171         case 'H': case 'h':
13172         case 'J': case 'j':
13173         case 'N': case 'n':
13174         case 'W': case 'w':
13175         case 'Y': case 'y':
13176             /* These all are targets of multi-character folds from code points
13177              * that require UTF8 to express, so they can't match unless the
13178              * target string is in UTF-8, so no action here is necessary, as
13179              * regexec.c properly handles the general case for UTF-8 matching
13180              * and multi-char folds */
13181             break;
13182         default:
13183             /* Use deprecated warning to increase the chances of this being
13184              * output */
13185             ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
13186             break;
13187     }
13188 }
13189
13190 /* The names of properties whose definitions are not known at compile time are
13191  * stored in this SV, after a constant heading.  So if the length has been
13192  * changed since initialization, then there is a run-time definition. */
13193 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
13194                                         (SvCUR(listsv) != initial_listsv_len)
13195
13196 STATIC regnode *
13197 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
13198                  const bool stop_at_1,  /* Just parse the next thing, don't
13199                                            look for a full character class */
13200                  bool allow_multi_folds,
13201                  const bool silence_non_portable,   /* Don't output warnings
13202                                                        about too large
13203                                                        characters */
13204                  SV** ret_invlist)  /* Return an inversion list, not a node */
13205 {
13206     /* parse a bracketed class specification.  Most of these will produce an
13207      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
13208      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
13209      * under /i with multi-character folds: it will be rewritten following the
13210      * paradigm of this example, where the <multi-fold>s are characters which
13211      * fold to multiple character sequences:
13212      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
13213      * gets effectively rewritten as:
13214      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
13215      * reg() gets called (recursively) on the rewritten version, and this
13216      * function will return what it constructs.  (Actually the <multi-fold>s
13217      * aren't physically removed from the [abcdefghi], it's just that they are
13218      * ignored in the recursion by means of a flag:
13219      * <RExC_in_multi_char_class>.)
13220      *
13221      * ANYOF nodes contain a bit map for the first 256 characters, with the
13222      * corresponding bit set if that character is in the list.  For characters
13223      * above 255, a range list or swash is used.  There are extra bits for \w,
13224      * etc. in locale ANYOFs, as what these match is not determinable at
13225      * compile time
13226      *
13227      * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
13228      * to be restarted.  This can only happen if ret_invlist is non-NULL.
13229      */
13230
13231     dVAR;
13232     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
13233     IV range = 0;
13234     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
13235     regnode *ret;
13236     STRLEN numlen;
13237     IV namedclass = OOB_NAMEDCLASS;
13238     char *rangebegin = NULL;
13239     bool need_class = 0;
13240     SV *listsv = NULL;
13241     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
13242                                       than just initialized.  */
13243     SV* properties = NULL;    /* Code points that match \p{} \P{} */
13244     SV* posixes = NULL;     /* Code points that match classes like [:word:],
13245                                extended beyond the Latin1 range.  These have to
13246                                be kept separate from other code points for much
13247                                of this function because their handling  is
13248                                different under /i, and for most classes under
13249                                /d as well */
13250     SV* nposixes = NULL;    /* Similarly for [:^word:].  These are kept
13251                                separate for a while from the non-complemented
13252                                versions because of complications with /d
13253                                matching */
13254     UV element_count = 0;   /* Number of distinct elements in the class.
13255                                Optimizations may be possible if this is tiny */
13256     AV * multi_char_matches = NULL; /* Code points that fold to more than one
13257                                        character; used under /i */
13258     UV n;
13259     char * stop_ptr = RExC_end;    /* where to stop parsing */
13260     const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
13261                                                    space? */
13262     const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
13263
13264     /* Unicode properties are stored in a swash; this holds the current one
13265      * being parsed.  If this swash is the only above-latin1 component of the
13266      * character class, an optimization is to pass it directly on to the
13267      * execution engine.  Otherwise, it is set to NULL to indicate that there
13268      * are other things in the class that have to be dealt with at execution
13269      * time */
13270     SV* swash = NULL;           /* Code points that match \p{} \P{} */
13271
13272     /* Set if a component of this character class is user-defined; just passed
13273      * on to the engine */
13274     bool has_user_defined_property = FALSE;
13275
13276     /* inversion list of code points this node matches only when the target
13277      * string is in UTF-8.  (Because is under /d) */
13278     SV* depends_list = NULL;
13279
13280     /* Inversion list of code points this node matches regardless of things
13281      * like locale, folding, utf8ness of the target string */
13282     SV* cp_list = NULL;
13283
13284     /* Like cp_list, but code points on this list need to be checked for things
13285      * that fold to/from them under /i */
13286     SV* cp_foldable_list = NULL;
13287
13288     /* Like cp_list, but code points on this list are valid only when the
13289      * runtime locale is UTF-8 */
13290     SV* only_utf8_locale_list = NULL;
13291
13292 #ifdef EBCDIC
13293     /* In a range, counts how many 0-2 of the ends of it came from literals,
13294      * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
13295     UV literal_endpoint = 0;
13296 #endif
13297     bool invert = FALSE;    /* Is this class to be complemented */
13298
13299     bool warn_super = ALWAYS_WARN_SUPER;
13300
13301     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
13302         case we need to change the emitted regop to an EXACT. */
13303     const char * orig_parse = RExC_parse;
13304     const SSize_t orig_size = RExC_size;
13305     bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
13306     GET_RE_DEBUG_FLAGS_DECL;
13307
13308     PERL_ARGS_ASSERT_REGCLASS;
13309 #ifndef DEBUGGING
13310     PERL_UNUSED_ARG(depth);
13311 #endif
13312
13313     DEBUG_PARSE("clas");
13314
13315     /* Assume we are going to generate an ANYOF node. */
13316     ret = reganode(pRExC_state, ANYOF, 0);
13317
13318     if (SIZE_ONLY) {
13319         RExC_size += ANYOF_SKIP;
13320         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
13321     }
13322     else {
13323         ANYOF_FLAGS(ret) = 0;
13324
13325         RExC_emit += ANYOF_SKIP;
13326         listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
13327         initial_listsv_len = SvCUR(listsv);
13328         SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
13329     }
13330
13331     if (skip_white) {
13332         RExC_parse = regpatws(pRExC_state, RExC_parse,
13333                               FALSE /* means don't recognize comments */ );
13334     }
13335
13336     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
13337         RExC_parse++;
13338         invert = TRUE;
13339         allow_multi_folds = FALSE;
13340         RExC_naughty++;
13341         if (skip_white) {
13342             RExC_parse = regpatws(pRExC_state, RExC_parse,
13343                                   FALSE /* means don't recognize comments */ );
13344         }
13345     }
13346
13347     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
13348     if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
13349         const char *s = RExC_parse;
13350         const char  c = *s++;
13351
13352         while (isWORDCHAR(*s))
13353             s++;
13354         if (*s && c == *s && s[1] == ']') {
13355             SAVEFREESV(RExC_rx_sv);
13356             ckWARN3reg(s+2,
13357                        "POSIX syntax [%c %c] belongs inside character classes",
13358                        c, c);
13359             (void)ReREFCNT_inc(RExC_rx_sv);
13360         }
13361     }
13362
13363     /* If the caller wants us to just parse a single element, accomplish this
13364      * by faking the loop ending condition */
13365     if (stop_at_1 && RExC_end > RExC_parse) {
13366         stop_ptr = RExC_parse + 1;
13367     }
13368
13369     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
13370     if (UCHARAT(RExC_parse) == ']')
13371         goto charclassloop;
13372
13373 parseit:
13374     while (1) {
13375         if  (RExC_parse >= stop_ptr) {
13376             break;
13377         }
13378
13379         if (skip_white) {
13380             RExC_parse = regpatws(pRExC_state, RExC_parse,
13381                                   FALSE /* means don't recognize comments */ );
13382         }
13383
13384         if  (UCHARAT(RExC_parse) == ']') {
13385             break;
13386         }
13387
13388     charclassloop:
13389
13390         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
13391         save_value = value;
13392         save_prevvalue = prevvalue;
13393
13394         if (!range) {
13395             rangebegin = RExC_parse;
13396             element_count++;
13397         }
13398         if (UTF) {
13399             value = utf8n_to_uvchr((U8*)RExC_parse,
13400                                    RExC_end - RExC_parse,
13401                                    &numlen, UTF8_ALLOW_DEFAULT);
13402             RExC_parse += numlen;
13403         }
13404         else
13405             value = UCHARAT(RExC_parse++);
13406
13407         if (value == '['
13408             && RExC_parse < RExC_end
13409             && POSIXCC(UCHARAT(RExC_parse)))
13410         {
13411             namedclass = regpposixcc(pRExC_state, value, strict);
13412         }
13413         else if (value == '\\') {
13414             if (UTF) {
13415                 value = utf8n_to_uvchr((U8*)RExC_parse,
13416                                    RExC_end - RExC_parse,
13417                                    &numlen, UTF8_ALLOW_DEFAULT);
13418                 RExC_parse += numlen;
13419             }
13420             else
13421                 value = UCHARAT(RExC_parse++);
13422
13423             /* Some compilers cannot handle switching on 64-bit integer
13424              * values, therefore value cannot be an UV.  Yes, this will
13425              * be a problem later if we want switch on Unicode.
13426              * A similar issue a little bit later when switching on
13427              * namedclass. --jhi */
13428
13429             /* If the \ is escaping white space when white space is being
13430              * skipped, it means that that white space is wanted literally, and
13431              * is already in 'value'.  Otherwise, need to translate the escape
13432              * into what it signifies. */
13433             if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
13434
13435             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
13436             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
13437             case 's':   namedclass = ANYOF_SPACE;       break;
13438             case 'S':   namedclass = ANYOF_NSPACE;      break;
13439             case 'd':   namedclass = ANYOF_DIGIT;       break;
13440             case 'D':   namedclass = ANYOF_NDIGIT;      break;
13441             case 'v':   namedclass = ANYOF_VERTWS;      break;
13442             case 'V':   namedclass = ANYOF_NVERTWS;     break;
13443             case 'h':   namedclass = ANYOF_HORIZWS;     break;
13444             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
13445             case 'N':  /* Handle \N{NAME} in class */
13446                 {
13447                     /* We only pay attention to the first char of
13448                     multichar strings being returned. I kinda wonder
13449                     if this makes sense as it does change the behaviour
13450                     from earlier versions, OTOH that behaviour was broken
13451                     as well. */
13452                     if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
13453                                       TRUE, /* => charclass */
13454                                       strict))
13455                     {
13456                         if (*flagp & RESTART_UTF8)
13457                             FAIL("panic: grok_bslash_N set RESTART_UTF8");
13458                         goto parseit;
13459                     }
13460                 }
13461                 break;
13462             case 'p':
13463             case 'P':
13464                 {
13465                 char *e;
13466
13467                 /* We will handle any undefined properties ourselves */
13468                 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
13469                                        /* And we actually would prefer to get
13470                                         * the straight inversion list of the
13471                                         * swash, since we will be accessing it
13472                                         * anyway, to save a little time */
13473                                       |_CORE_SWASH_INIT_ACCEPT_INVLIST;
13474
13475                 if (RExC_parse >= RExC_end)
13476                     vFAIL2("Empty \\%c{}", (U8)value);
13477                 if (*RExC_parse == '{') {
13478                     const U8 c = (U8)value;
13479                     e = strchr(RExC_parse++, '}');
13480                     if (!e)
13481                         vFAIL2("Missing right brace on \\%c{}", c);
13482                     while (isSPACE(*RExC_parse))
13483                         RExC_parse++;
13484                     if (e == RExC_parse)
13485                         vFAIL2("Empty \\%c{}", c);
13486                     n = e - RExC_parse;
13487                     while (isSPACE(*(RExC_parse + n - 1)))
13488                         n--;
13489                 }
13490                 else {
13491                     e = RExC_parse;
13492                     n = 1;
13493                 }
13494                 if (!SIZE_ONLY) {
13495                     SV* invlist;
13496                     char* formatted;
13497                     char* name;
13498
13499                     if (UCHARAT(RExC_parse) == '^') {
13500                          RExC_parse++;
13501                          n--;
13502                          /* toggle.  (The rhs xor gets the single bit that
13503                           * differs between P and p; the other xor inverts just
13504                           * that bit) */
13505                          value ^= 'P' ^ 'p';
13506
13507                          while (isSPACE(*RExC_parse)) {
13508                               RExC_parse++;
13509                               n--;
13510                          }
13511                     }
13512                     /* Try to get the definition of the property into
13513                      * <invlist>.  If /i is in effect, the effective property
13514                      * will have its name be <__NAME_i>.  The design is
13515                      * discussed in commit
13516                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
13517                     formatted = Perl_form(aTHX_
13518                                           "%s%.*s%s\n",
13519                                           (FOLD) ? "__" : "",
13520                                           (int)n,
13521                                           RExC_parse,
13522                                           (FOLD) ? "_i" : ""
13523                                 );
13524                     name = savepvn(formatted, strlen(formatted));
13525
13526                     /* Look up the property name, and get its swash and
13527                      * inversion list, if the property is found  */
13528                     if (swash) {
13529                         SvREFCNT_dec_NN(swash);
13530                     }
13531                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
13532                                              1, /* binary */
13533                                              0, /* not tr/// */
13534                                              NULL, /* No inversion list */
13535                                              &swash_init_flags
13536                                             );
13537                     if (! swash || ! (invlist = _get_swash_invlist(swash))) {
13538                         if (swash) {
13539                             SvREFCNT_dec_NN(swash);
13540                             swash = NULL;
13541                         }
13542
13543                         /* Here didn't find it.  It could be a user-defined
13544                          * property that will be available at run-time.  If we
13545                          * accept only compile-time properties, is an error;
13546                          * otherwise add it to the list for run-time look up */
13547                         if (ret_invlist) {
13548                             RExC_parse = e + 1;
13549                             vFAIL2utf8f(
13550                                 "Property '%"UTF8f"' is unknown",
13551                                 UTF8fARG(UTF, n, name));
13552                         }
13553                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
13554                                         (value == 'p' ? '+' : '!'),
13555                                         UTF8fARG(UTF, n, name));
13556                         has_user_defined_property = TRUE;
13557
13558                         /* We don't know yet, so have to assume that the
13559                          * property could match something in the Latin1 range,
13560                          * hence something that isn't utf8.  Note that this
13561                          * would cause things in <depends_list> to match
13562                          * inappropriately, except that any \p{}, including
13563                          * this one forces Unicode semantics, which means there
13564                          * is no <depends_list> */
13565                         ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
13566                     }
13567                     else {
13568
13569                         /* Here, did get the swash and its inversion list.  If
13570                          * the swash is from a user-defined property, then this
13571                          * whole character class should be regarded as such */
13572                         if (swash_init_flags
13573                             & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
13574                         {
13575                             has_user_defined_property = TRUE;
13576                         }
13577                         else if
13578                             /* We warn on matching an above-Unicode code point
13579                              * if the match would return true, except don't
13580                              * warn for \p{All}, which has exactly one element
13581                              * = 0 */
13582                             (_invlist_contains_cp(invlist, 0x110000)
13583                                 && (! (_invlist_len(invlist) == 1
13584                                        && *invlist_array(invlist) == 0)))
13585                         {
13586                             warn_super = TRUE;
13587                         }
13588
13589
13590                         /* Invert if asking for the complement */
13591                         if (value == 'P') {
13592                             _invlist_union_complement_2nd(properties,
13593                                                           invlist,
13594                                                           &properties);
13595
13596                             /* The swash can't be used as-is, because we've
13597                              * inverted things; delay removing it to here after
13598                              * have copied its invlist above */
13599                             SvREFCNT_dec_NN(swash);
13600                             swash = NULL;
13601                         }
13602                         else {
13603                             _invlist_union(properties, invlist, &properties);
13604                         }
13605                     }
13606                     Safefree(name);
13607                 }
13608                 RExC_parse = e + 1;
13609                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
13610                                                 named */
13611
13612                 /* \p means they want Unicode semantics */
13613                 RExC_uni_semantics = 1;
13614                 }
13615                 break;
13616             case 'n':   value = '\n';                   break;
13617             case 'r':   value = '\r';                   break;
13618             case 't':   value = '\t';                   break;
13619             case 'f':   value = '\f';                   break;
13620             case 'b':   value = '\b';                   break;
13621             case 'e':   value = ASCII_TO_NATIVE('\033');break;
13622             case 'a':   value = '\a';                   break;
13623             case 'o':
13624                 RExC_parse--;   /* function expects to be pointed at the 'o' */
13625                 {
13626                     const char* error_msg;
13627                     bool valid = grok_bslash_o(&RExC_parse,
13628                                                &value,
13629                                                &error_msg,
13630                                                SIZE_ONLY,   /* warnings in pass
13631                                                                1 only */
13632                                                strict,
13633                                                silence_non_portable,
13634                                                UTF);
13635                     if (! valid) {
13636                         vFAIL(error_msg);
13637                     }
13638                 }
13639                 if (PL_encoding && value < 0x100) {
13640                     goto recode_encoding;
13641                 }
13642                 break;
13643             case 'x':
13644                 RExC_parse--;   /* function expects to be pointed at the 'x' */
13645                 {
13646                     const char* error_msg;
13647                     bool valid = grok_bslash_x(&RExC_parse,
13648                                                &value,
13649                                                &error_msg,
13650                                                TRUE, /* Output warnings */
13651                                                strict,
13652                                                silence_non_portable,
13653                                                UTF);
13654                     if (! valid) {
13655                         vFAIL(error_msg);
13656                     }
13657                 }
13658                 if (PL_encoding && value < 0x100)
13659                     goto recode_encoding;
13660                 break;
13661             case 'c':
13662                 value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
13663                 break;
13664             case '0': case '1': case '2': case '3': case '4':
13665             case '5': case '6': case '7':
13666                 {
13667                     /* Take 1-3 octal digits */
13668                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
13669                     numlen = (strict) ? 4 : 3;
13670                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
13671                     RExC_parse += numlen;
13672                     if (numlen != 3) {
13673                         if (strict) {
13674                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13675                             vFAIL("Need exactly 3 octal digits");
13676                         }
13677                         else if (! SIZE_ONLY /* like \08, \178 */
13678                                  && numlen < 3
13679                                  && RExC_parse < RExC_end
13680                                  && isDIGIT(*RExC_parse)
13681                                  && ckWARN(WARN_REGEXP))
13682                         {
13683                             SAVEFREESV(RExC_rx_sv);
13684                             reg_warn_non_literal_string(
13685                                  RExC_parse + 1,
13686                                  form_short_octal_warning(RExC_parse, numlen));
13687                             (void)ReREFCNT_inc(RExC_rx_sv);
13688                         }
13689                     }
13690                     if (PL_encoding && value < 0x100)
13691                         goto recode_encoding;
13692                     break;
13693                 }
13694             recode_encoding:
13695                 if (! RExC_override_recoding) {
13696                     SV* enc = PL_encoding;
13697                     value = reg_recode((const char)(U8)value, &enc);
13698                     if (!enc) {
13699                         if (strict) {
13700                             vFAIL("Invalid escape in the specified encoding");
13701                         }
13702                         else if (SIZE_ONLY) {
13703                             ckWARNreg(RExC_parse,
13704                                   "Invalid escape in the specified encoding");
13705                         }
13706                     }
13707                     break;
13708                 }
13709             default:
13710                 /* Allow \_ to not give an error */
13711                 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
13712                     if (strict) {
13713                         vFAIL2("Unrecognized escape \\%c in character class",
13714                                (int)value);
13715                     }
13716                     else {
13717                         SAVEFREESV(RExC_rx_sv);
13718                         ckWARN2reg(RExC_parse,
13719                             "Unrecognized escape \\%c in character class passed through",
13720                             (int)value);
13721                         (void)ReREFCNT_inc(RExC_rx_sv);
13722                     }
13723                 }
13724                 break;
13725             }   /* End of switch on char following backslash */
13726         } /* end of handling backslash escape sequences */
13727 #ifdef EBCDIC
13728         else
13729             literal_endpoint++;
13730 #endif
13731
13732         /* Here, we have the current token in 'value' */
13733
13734         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
13735             U8 classnum;
13736
13737             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
13738              * literal, as is the character that began the false range, i.e.
13739              * the 'a' in the examples */
13740             if (range) {
13741                 if (!SIZE_ONLY) {
13742                     const int w = (RExC_parse >= rangebegin)
13743                                   ? RExC_parse - rangebegin
13744                                   : 0;
13745                     if (strict) {
13746                         vFAIL2utf8f(
13747                             "False [] range \"%"UTF8f"\"",
13748                             UTF8fARG(UTF, w, rangebegin));
13749                     }
13750                     else {
13751                         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
13752                         ckWARN2reg(RExC_parse,
13753                             "False [] range \"%"UTF8f"\"",
13754                             UTF8fARG(UTF, w, rangebegin));
13755                         (void)ReREFCNT_inc(RExC_rx_sv);
13756                         cp_list = add_cp_to_invlist(cp_list, '-');
13757                         cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
13758                                                              prevvalue);
13759                     }
13760                 }
13761
13762                 range = 0; /* this was not a true range */
13763                 element_count += 2; /* So counts for three values */
13764             }
13765
13766             classnum = namedclass_to_classnum(namedclass);
13767
13768             if (LOC && namedclass < ANYOF_POSIXL_MAX
13769 #ifndef HAS_ISASCII
13770                 && classnum != _CC_ASCII
13771 #endif
13772             ) {
13773                 /* What the Posix classes (like \w, [:space:]) match in locale
13774                  * isn't knowable under locale until actual match time.  Room
13775                  * must be reserved (one time per outer bracketed class) to
13776                  * store such classes.  The space will contain a bit for each
13777                  * named class that is to be matched against.  This isn't
13778                  * needed for \p{} and pseudo-classes, as they are not affected
13779                  * by locale, and hence are dealt with separately */
13780                 if (! need_class) {
13781                     need_class = 1;
13782                     if (SIZE_ONLY) {
13783                         RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13784                     }
13785                     else {
13786                         RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13787                     }
13788                     ANYOF_FLAGS(ret) |= ANYOF_POSIXL;
13789                     ANYOF_POSIXL_ZERO(ret);
13790                 }
13791
13792                 /* See if it already matches the complement of this POSIX
13793                  * class */
13794                 if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL)
13795                     && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
13796                                                             ? -1
13797                                                             : 1)))
13798                 {
13799                     posixl_matches_all = TRUE;
13800                     break;  /* No need to continue.  Since it matches both
13801                                e.g., \w and \W, it matches everything, and the
13802                                bracketed class can be optimized into qr/./s */
13803                 }
13804
13805                 /* Add this class to those that should be checked at runtime */
13806                 ANYOF_POSIXL_SET(ret, namedclass);
13807
13808                 /* The above-Latin1 characters are not subject to locale rules.
13809                  * Just add them, in the second pass, to the
13810                  * unconditionally-matched list */
13811                 if (! SIZE_ONLY) {
13812                     SV* scratch_list = NULL;
13813
13814                     /* Get the list of the above-Latin1 code points this
13815                      * matches */
13816                     _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
13817                                           PL_XPosix_ptrs[classnum],
13818
13819                                           /* Odd numbers are complements, like
13820                                            * NDIGIT, NASCII, ... */
13821                                           namedclass % 2 != 0,
13822                                           &scratch_list);
13823                     /* Checking if 'cp_list' is NULL first saves an extra
13824                      * clone.  Its reference count will be decremented at the
13825                      * next union, etc, or if this is the only instance, at the
13826                      * end of the routine */
13827                     if (! cp_list) {
13828                         cp_list = scratch_list;
13829                     }
13830                     else {
13831                         _invlist_union(cp_list, scratch_list, &cp_list);
13832                         SvREFCNT_dec_NN(scratch_list);
13833                     }
13834                     continue;   /* Go get next character */
13835                 }
13836             }
13837             else if (! SIZE_ONLY) {
13838
13839                 /* Here, not in pass1 (in that pass we skip calculating the
13840                  * contents of this class), and is /l, or is a POSIX class for
13841                  * which /l doesn't matter (or is a Unicode property, which is
13842                  * skipped here). */
13843                 if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
13844                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
13845
13846                         /* Here, should be \h, \H, \v, or \V.  None of /d, /i
13847                          * nor /l make a difference in what these match,
13848                          * therefore we just add what they match to cp_list. */
13849                         if (classnum != _CC_VERTSPACE) {
13850                             assert(   namedclass == ANYOF_HORIZWS
13851                                    || namedclass == ANYOF_NHORIZWS);
13852
13853                             /* It turns out that \h is just a synonym for
13854                              * XPosixBlank */
13855                             classnum = _CC_BLANK;
13856                         }
13857
13858                         _invlist_union_maybe_complement_2nd(
13859                                 cp_list,
13860                                 PL_XPosix_ptrs[classnum],
13861                                 namedclass % 2 != 0,    /* Complement if odd
13862                                                           (NHORIZWS, NVERTWS)
13863                                                         */
13864                                 &cp_list);
13865                     }
13866                 }
13867                 else {  /* Garden variety class.  If is NASCII, NDIGIT, ...
13868                            complement and use nposixes */
13869                     SV** posixes_ptr = namedclass % 2 == 0
13870                                        ? &posixes
13871                                        : &nposixes;
13872                     SV** source_ptr = &PL_XPosix_ptrs[classnum];
13873                     _invlist_union_maybe_complement_2nd(
13874                                                      *posixes_ptr,
13875                                                      *source_ptr,
13876                                                      namedclass % 2 != 0,
13877                                                      posixes_ptr);
13878                 }
13879                 continue;   /* Go get next character */
13880             }
13881         } /* end of namedclass \blah */
13882
13883         /* Here, we have a single value.  If 'range' is set, it is the ending
13884          * of a range--check its validity.  Later, we will handle each
13885          * individual code point in the range.  If 'range' isn't set, this
13886          * could be the beginning of a range, so check for that by looking
13887          * ahead to see if the next real character to be processed is the range
13888          * indicator--the minus sign */
13889
13890         if (skip_white) {
13891             RExC_parse = regpatws(pRExC_state, RExC_parse,
13892                                 FALSE /* means don't recognize comments */ );
13893         }
13894
13895         if (range) {
13896             if (prevvalue > value) /* b-a */ {
13897                 const int w = RExC_parse - rangebegin;
13898                 vFAIL2utf8f(
13899                     "Invalid [] range \"%"UTF8f"\"",
13900                     UTF8fARG(UTF, w, rangebegin));
13901                 range = 0; /* not a valid range */
13902             }
13903         }
13904         else {
13905             prevvalue = value; /* save the beginning of the potential range */
13906             if (! stop_at_1     /* Can't be a range if parsing just one thing */
13907                 && *RExC_parse == '-')
13908             {
13909                 char* next_char_ptr = RExC_parse + 1;
13910                 if (skip_white) {   /* Get the next real char after the '-' */
13911                     next_char_ptr = regpatws(pRExC_state,
13912                                              RExC_parse + 1,
13913                                              FALSE); /* means don't recognize
13914                                                         comments */
13915                 }
13916
13917                 /* If the '-' is at the end of the class (just before the ']',
13918                  * it is a literal minus; otherwise it is a range */
13919                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
13920                     RExC_parse = next_char_ptr;
13921
13922                     /* a bad range like \w-, [:word:]- ? */
13923                     if (namedclass > OOB_NAMEDCLASS) {
13924                         if (strict || ckWARN(WARN_REGEXP)) {
13925                             const int w =
13926                                 RExC_parse >= rangebegin ?
13927                                 RExC_parse - rangebegin : 0;
13928                             if (strict) {
13929                                 vFAIL4("False [] range \"%*.*s\"",
13930                                     w, w, rangebegin);
13931                             }
13932                             else {
13933                                 vWARN4(RExC_parse,
13934                                     "False [] range \"%*.*s\"",
13935                                     w, w, rangebegin);
13936                             }
13937                         }
13938                         if (!SIZE_ONLY) {
13939                             cp_list = add_cp_to_invlist(cp_list, '-');
13940                         }
13941                         element_count++;
13942                     } else
13943                         range = 1;      /* yeah, it's a range! */
13944                     continue;   /* but do it the next time */
13945                 }
13946             }
13947         }
13948
13949         /* Here, <prevvalue> is the beginning of the range, if any; or <value>
13950          * if not */
13951
13952         /* non-Latin1 code point implies unicode semantics.  Must be set in
13953          * pass1 so is there for the whole of pass 2 */
13954         if (value > 255) {
13955             RExC_uni_semantics = 1;
13956         }
13957
13958         /* Ready to process either the single value, or the completed range.
13959          * For single-valued non-inverted ranges, we consider the possibility
13960          * of multi-char folds.  (We made a conscious decision to not do this
13961          * for the other cases because it can often lead to non-intuitive
13962          * results.  For example, you have the peculiar case that:
13963          *  "s s" =~ /^[^\xDF]+$/i => Y
13964          *  "ss"  =~ /^[^\xDF]+$/i => N
13965          *
13966          * See [perl #89750] */
13967         if (FOLD && allow_multi_folds && value == prevvalue) {
13968             if (value == LATIN_SMALL_LETTER_SHARP_S
13969                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
13970                                                         value)))
13971             {
13972                 /* Here <value> is indeed a multi-char fold.  Get what it is */
13973
13974                 U8 foldbuf[UTF8_MAXBYTES_CASE];
13975                 STRLEN foldlen;
13976
13977                 UV folded = _to_uni_fold_flags(
13978                                 value,
13979                                 foldbuf,
13980                                 &foldlen,
13981                                 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
13982                                                    ? FOLD_FLAGS_NOMIX_ASCII
13983                                                    : 0)
13984                                 );
13985
13986                 /* Here, <folded> should be the first character of the
13987                  * multi-char fold of <value>, with <foldbuf> containing the
13988                  * whole thing.  But, if this fold is not allowed (because of
13989                  * the flags), <fold> will be the same as <value>, and should
13990                  * be processed like any other character, so skip the special
13991                  * handling */
13992                 if (folded != value) {
13993
13994                     /* Skip if we are recursed, currently parsing the class
13995                      * again.  Otherwise add this character to the list of
13996                      * multi-char folds. */
13997                     if (! RExC_in_multi_char_class) {
13998                         AV** this_array_ptr;
13999                         AV* this_array;
14000                         STRLEN cp_count = utf8_length(foldbuf,
14001                                                       foldbuf + foldlen);
14002                         SV* multi_fold = sv_2mortal(newSVpvn("", 0));
14003
14004                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
14005
14006
14007                         if (! multi_char_matches) {
14008                             multi_char_matches = newAV();
14009                         }
14010
14011                         /* <multi_char_matches> is actually an array of arrays.
14012                          * There will be one or two top-level elements: [2],
14013                          * and/or [3].  The [2] element is an array, each
14014                          * element thereof is a character which folds to TWO
14015                          * characters; [3] is for folds to THREE characters.
14016                          * (Unicode guarantees a maximum of 3 characters in any
14017                          * fold.)  When we rewrite the character class below,
14018                          * we will do so such that the longest folds are
14019                          * written first, so that it prefers the longest
14020                          * matching strings first.  This is done even if it
14021                          * turns out that any quantifier is non-greedy, out of
14022                          * programmer laziness.  Tom Christiansen has agreed
14023                          * that this is ok.  This makes the test for the
14024                          * ligature 'ffi' come before the test for 'ff' */
14025                         if (av_exists(multi_char_matches, cp_count)) {
14026                             this_array_ptr = (AV**) av_fetch(multi_char_matches,
14027                                                              cp_count, FALSE);
14028                             this_array = *this_array_ptr;
14029                         }
14030                         else {
14031                             this_array = newAV();
14032                             av_store(multi_char_matches, cp_count,
14033                                      (SV*) this_array);
14034                         }
14035                         av_push(this_array, multi_fold);
14036                     }
14037
14038                     /* This element should not be processed further in this
14039                      * class */
14040                     element_count--;
14041                     value = save_value;
14042                     prevvalue = save_prevvalue;
14043                     continue;
14044                 }
14045             }
14046         }
14047
14048         /* Deal with this element of the class */
14049         if (! SIZE_ONLY) {
14050 #ifndef EBCDIC
14051             cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14052                                                      prevvalue, value);
14053 #else
14054             SV* this_range = _new_invlist(1);
14055             _append_range_to_invlist(this_range, prevvalue, value);
14056
14057             /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
14058              * If this range was specified using something like 'i-j', we want
14059              * to include only the 'i' and the 'j', and not anything in
14060              * between, so exclude non-ASCII, non-alphabetics from it.
14061              * However, if the range was specified with something like
14062              * [\x89-\x91] or [\x89-j], all code points within it should be
14063              * included.  literal_endpoint==2 means both ends of the range used
14064              * a literal character, not \x{foo} */
14065             if (literal_endpoint == 2
14066                 && ((prevvalue >= 'a' && value <= 'z')
14067                     || (prevvalue >= 'A' && value <= 'Z')))
14068             {
14069                 _invlist_intersection(this_range, PL_ASCII,
14070                                       &this_range);
14071
14072                 /* Since this above only contains ascii, the intersection of it
14073                  * with anything will still yield only ascii */
14074                 _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA],
14075                                       &this_range);
14076             }
14077             _invlist_union(cp_foldable_list, this_range, &cp_foldable_list);
14078             literal_endpoint = 0;
14079 #endif
14080         }
14081
14082         range = 0; /* this range (if it was one) is done now */
14083     } /* End of loop through all the text within the brackets */
14084
14085     /* If anything in the class expands to more than one character, we have to
14086      * deal with them by building up a substitute parse string, and recursively
14087      * calling reg() on it, instead of proceeding */
14088     if (multi_char_matches) {
14089         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
14090         I32 cp_count;
14091         STRLEN len;
14092         char *save_end = RExC_end;
14093         char *save_parse = RExC_parse;
14094         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
14095                                        a "|" */
14096         I32 reg_flags;
14097
14098         assert(! invert);
14099 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
14100            because too confusing */
14101         if (invert) {
14102             sv_catpv(substitute_parse, "(?:");
14103         }
14104 #endif
14105
14106         /* Look at the longest folds first */
14107         for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
14108
14109             if (av_exists(multi_char_matches, cp_count)) {
14110                 AV** this_array_ptr;
14111                 SV* this_sequence;
14112
14113                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
14114                                                  cp_count, FALSE);
14115                 while ((this_sequence = av_pop(*this_array_ptr)) !=
14116                                                                 &PL_sv_undef)
14117                 {
14118                     if (! first_time) {
14119                         sv_catpv(substitute_parse, "|");
14120                     }
14121                     first_time = FALSE;
14122
14123                     sv_catpv(substitute_parse, SvPVX(this_sequence));
14124                 }
14125             }
14126         }
14127
14128         /* If the character class contains anything else besides these
14129          * multi-character folds, have to include it in recursive parsing */
14130         if (element_count) {
14131             sv_catpv(substitute_parse, "|[");
14132             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
14133             sv_catpv(substitute_parse, "]");
14134         }
14135
14136         sv_catpv(substitute_parse, ")");
14137 #if 0
14138         if (invert) {
14139             /* This is a way to get the parse to skip forward a whole named
14140              * sequence instead of matching the 2nd character when it fails the
14141              * first */
14142             sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
14143         }
14144 #endif
14145
14146         RExC_parse = SvPV(substitute_parse, len);
14147         RExC_end = RExC_parse + len;
14148         RExC_in_multi_char_class = 1;
14149         RExC_emit = (regnode *)orig_emit;
14150
14151         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
14152
14153         *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
14154
14155         RExC_parse = save_parse;
14156         RExC_end = save_end;
14157         RExC_in_multi_char_class = 0;
14158         SvREFCNT_dec_NN(multi_char_matches);
14159         return ret;
14160     }
14161
14162     /* Here, we've gone through the entire class and dealt with multi-char
14163      * folds.  We are now in a position that we can do some checks to see if we
14164      * can optimize this ANYOF node into a simpler one, even in Pass 1.
14165      * Currently we only do two checks:
14166      * 1) is in the unlikely event that the user has specified both, eg. \w and
14167      *    \W under /l, then the class matches everything.  (This optimization
14168      *    is done only to make the optimizer code run later work.)
14169      * 2) if the character class contains only a single element (including a
14170      *    single range), we see if there is an equivalent node for it.
14171      * Other checks are possible */
14172     if (! ret_invlist   /* Can't optimize if returning the constructed
14173                            inversion list */
14174         && (UNLIKELY(posixl_matches_all) || element_count == 1))
14175     {
14176         U8 op = END;
14177         U8 arg = 0;
14178
14179         if (UNLIKELY(posixl_matches_all)) {
14180             op = SANY;
14181         }
14182         else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
14183                                                    \w or [:digit:] or \p{foo}
14184                                                  */
14185
14186             /* All named classes are mapped into POSIXish nodes, with its FLAG
14187              * argument giving which class it is */
14188             switch ((I32)namedclass) {
14189                 case ANYOF_UNIPROP:
14190                     break;
14191
14192                 /* These don't depend on the charset modifiers.  They always
14193                  * match under /u rules */
14194                 case ANYOF_NHORIZWS:
14195                 case ANYOF_HORIZWS:
14196                     namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
14197                     /* FALLTHROUGH */
14198
14199                 case ANYOF_NVERTWS:
14200                 case ANYOF_VERTWS:
14201                     op = POSIXU;
14202                     goto join_posix;
14203
14204                 /* The actual POSIXish node for all the rest depends on the
14205                  * charset modifier.  The ones in the first set depend only on
14206                  * ASCII or, if available on this platform, locale */
14207                 case ANYOF_ASCII:
14208                 case ANYOF_NASCII:
14209 #ifdef HAS_ISASCII
14210                     op = (LOC) ? POSIXL : POSIXA;
14211 #else
14212                     op = POSIXA;
14213 #endif
14214                     goto join_posix;
14215
14216                 case ANYOF_NCASED:
14217                 case ANYOF_LOWER:
14218                 case ANYOF_NLOWER:
14219                 case ANYOF_UPPER:
14220                 case ANYOF_NUPPER:
14221                     /* under /a could be alpha */
14222                     if (FOLD) {
14223                         if (ASCII_RESTRICTED) {
14224                             namedclass = ANYOF_ALPHA + (namedclass % 2);
14225                         }
14226                         else if (! LOC) {
14227                             break;
14228                         }
14229                     }
14230                     /* FALLTHROUGH */
14231
14232                 /* The rest have more possibilities depending on the charset.
14233                  * We take advantage of the enum ordering of the charset
14234                  * modifiers to get the exact node type, */
14235                 default:
14236                     op = POSIXD + get_regex_charset(RExC_flags);
14237                     if (op > POSIXA) { /* /aa is same as /a */
14238                         op = POSIXA;
14239                     }
14240
14241                 join_posix:
14242                     /* The odd numbered ones are the complements of the
14243                      * next-lower even number one */
14244                     if (namedclass % 2 == 1) {
14245                         invert = ! invert;
14246                         namedclass--;
14247                     }
14248                     arg = namedclass_to_classnum(namedclass);
14249                     break;
14250             }
14251         }
14252         else if (value == prevvalue) {
14253
14254             /* Here, the class consists of just a single code point */
14255
14256             if (invert) {
14257                 if (! LOC && value == '\n') {
14258                     op = REG_ANY; /* Optimize [^\n] */
14259                     *flagp |= HASWIDTH|SIMPLE;
14260                     RExC_naughty++;
14261                 }
14262             }
14263             else if (value < 256 || UTF) {
14264
14265                 /* Optimize a single value into an EXACTish node, but not if it
14266                  * would require converting the pattern to UTF-8. */
14267                 op = compute_EXACTish(pRExC_state);
14268             }
14269         } /* Otherwise is a range */
14270         else if (! LOC) {   /* locale could vary these */
14271             if (prevvalue == '0') {
14272                 if (value == '9') {
14273                     arg = _CC_DIGIT;
14274                     op = POSIXA;
14275                 }
14276             }
14277             else if (prevvalue == 'A') {
14278                 if (value == 'Z'
14279 #ifdef EBCDIC
14280                     && literal_endpoint == 2
14281 #endif
14282                 ) {
14283                     arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
14284                     op = POSIXA;
14285                 }
14286             }
14287             else if (prevvalue == 'a') {
14288                 if (value == 'z'
14289 #ifdef EBCDIC
14290                     && literal_endpoint == 2
14291 #endif
14292                 ) {
14293                     arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
14294                     op = POSIXA;
14295                 }
14296             }
14297         }
14298
14299         /* Here, we have changed <op> away from its initial value iff we found
14300          * an optimization */
14301         if (op != END) {
14302
14303             /* Throw away this ANYOF regnode, and emit the calculated one,
14304              * which should correspond to the beginning, not current, state of
14305              * the parse */
14306             const char * cur_parse = RExC_parse;
14307             RExC_parse = (char *)orig_parse;
14308             if ( SIZE_ONLY) {
14309                 if (! LOC) {
14310
14311                     /* To get locale nodes to not use the full ANYOF size would
14312                      * require moving the code above that writes the portions
14313                      * of it that aren't in other nodes to after this point.
14314                      * e.g.  ANYOF_POSIXL_SET */
14315                     RExC_size = orig_size;
14316                 }
14317             }
14318             else {
14319                 RExC_emit = (regnode *)orig_emit;
14320                 if (PL_regkind[op] == POSIXD) {
14321                     if (op == POSIXL) {
14322                         RExC_contains_locale = 1;
14323                     }
14324                     if (invert) {
14325                         op += NPOSIXD - POSIXD;
14326                     }
14327                 }
14328             }
14329
14330             ret = reg_node(pRExC_state, op);
14331
14332             if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
14333                 if (! SIZE_ONLY) {
14334                     FLAGS(ret) = arg;
14335                 }
14336                 *flagp |= HASWIDTH|SIMPLE;
14337             }
14338             else if (PL_regkind[op] == EXACT) {
14339                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14340                                            TRUE /* downgradable to EXACT */
14341                                            );
14342             }
14343
14344             RExC_parse = (char *) cur_parse;
14345
14346             SvREFCNT_dec(posixes);
14347             SvREFCNT_dec(nposixes);
14348             SvREFCNT_dec(cp_list);
14349             SvREFCNT_dec(cp_foldable_list);
14350             return ret;
14351         }
14352     }
14353
14354     if (SIZE_ONLY)
14355         return ret;
14356     /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
14357
14358     /* If folding, we calculate all characters that could fold to or from the
14359      * ones already on the list */
14360     if (cp_foldable_list) {
14361         if (FOLD) {
14362             UV start, end;      /* End points of code point ranges */
14363
14364             SV* fold_intersection = NULL;
14365             SV** use_list;
14366
14367             /* Our calculated list will be for Unicode rules.  For locale
14368              * matching, we have to keep a separate list that is consulted at
14369              * runtime only when the locale indicates Unicode rules.  For
14370              * non-locale, we just use to the general list */
14371             if (LOC) {
14372                 use_list = &only_utf8_locale_list;
14373             }
14374             else {
14375                 use_list = &cp_list;
14376             }
14377
14378             /* Only the characters in this class that participate in folds need
14379              * be checked.  Get the intersection of this class and all the
14380              * possible characters that are foldable.  This can quickly narrow
14381              * down a large class */
14382             _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
14383                                   &fold_intersection);
14384
14385             /* The folds for all the Latin1 characters are hard-coded into this
14386              * program, but we have to go out to disk to get the others. */
14387             if (invlist_highest(cp_foldable_list) >= 256) {
14388
14389                 /* This is a hash that for a particular fold gives all
14390                  * characters that are involved in it */
14391                 if (! PL_utf8_foldclosures) {
14392                     _load_PL_utf8_foldclosures();
14393                 }
14394             }
14395
14396             /* Now look at the foldable characters in this class individually */
14397             invlist_iterinit(fold_intersection);
14398             while (invlist_iternext(fold_intersection, &start, &end)) {
14399                 UV j;
14400
14401                 /* Look at every character in the range */
14402                 for (j = start; j <= end; j++) {
14403                     U8 foldbuf[UTF8_MAXBYTES_CASE+1];
14404                     STRLEN foldlen;
14405                     SV** listp;
14406
14407                     if (j < 256) {
14408
14409                         if (IS_IN_SOME_FOLD_L1(j)) {
14410
14411                             /* ASCII is always matched; non-ASCII is matched
14412                              * only under Unicode rules (which could happen
14413                              * under /l if the locale is a UTF-8 one */
14414                             if (isASCII(j) || ! DEPENDS_SEMANTICS) {
14415                                 *use_list = add_cp_to_invlist(*use_list,
14416                                                             PL_fold_latin1[j]);
14417                             }
14418                             else {
14419                                 depends_list =
14420                                  add_cp_to_invlist(depends_list,
14421                                                    PL_fold_latin1[j]);
14422                             }
14423                         }
14424
14425                         if (HAS_NONLATIN1_FOLD_CLOSURE(j)
14426                             && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
14427                         {
14428                             add_above_Latin1_folds(pRExC_state,
14429                                                    (U8) j,
14430                                                    use_list);
14431                         }
14432                         continue;
14433                     }
14434
14435                     /* Here is an above Latin1 character.  We don't have the
14436                      * rules hard-coded for it.  First, get its fold.  This is
14437                      * the simple fold, as the multi-character folds have been
14438                      * handled earlier and separated out */
14439                     _to_uni_fold_flags(j, foldbuf, &foldlen,
14440                                                         (ASCII_FOLD_RESTRICTED)
14441                                                         ? FOLD_FLAGS_NOMIX_ASCII
14442                                                         : 0);
14443
14444                     /* Single character fold of above Latin1.  Add everything in
14445                     * its fold closure to the list that this node should match.
14446                     * The fold closures data structure is a hash with the keys
14447                     * being the UTF-8 of every character that is folded to, like
14448                     * 'k', and the values each an array of all code points that
14449                     * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
14450                     * Multi-character folds are not included */
14451                     if ((listp = hv_fetch(PL_utf8_foldclosures,
14452                                         (char *) foldbuf, foldlen, FALSE)))
14453                     {
14454                         AV* list = (AV*) *listp;
14455                         IV k;
14456                         for (k = 0; k <= av_tindex(list); k++) {
14457                             SV** c_p = av_fetch(list, k, FALSE);
14458                             UV c;
14459                             if (c_p == NULL) {
14460                                 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
14461                             }
14462                             c = SvUV(*c_p);
14463
14464                             /* /aa doesn't allow folds between ASCII and non- */
14465                             if ((ASCII_FOLD_RESTRICTED
14466                                 && (isASCII(c) != isASCII(j))))
14467                             {
14468                                 continue;
14469                             }
14470
14471                             /* Folds under /l which cross the 255/256 boundary
14472                              * are added to a separate list.  (These are valid
14473                              * only when the locale is UTF-8.) */
14474                             if (c < 256 && LOC) {
14475                                 *use_list = add_cp_to_invlist(*use_list, c);
14476                                 continue;
14477                             }
14478
14479                             if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
14480                             {
14481                                 cp_list = add_cp_to_invlist(cp_list, c);
14482                             }
14483                             else {
14484                                 /* Similarly folds involving non-ascii Latin1
14485                                 * characters under /d are added to their list */
14486                                 depends_list = add_cp_to_invlist(depends_list,
14487                                                                  c);
14488                             }
14489                         }
14490                     }
14491                 }
14492             }
14493             SvREFCNT_dec_NN(fold_intersection);
14494         }
14495
14496         /* Now that we have finished adding all the folds, there is no reason
14497          * to keep the foldable list separate */
14498         _invlist_union(cp_list, cp_foldable_list, &cp_list);
14499         SvREFCNT_dec_NN(cp_foldable_list);
14500     }
14501
14502     /* And combine the result (if any) with any inversion list from posix
14503      * classes.  The lists are kept separate up to now because we don't want to
14504      * fold the classes (folding of those is automatically handled by the swash
14505      * fetching code) */
14506     if (posixes || nposixes) {
14507         if (posixes && AT_LEAST_ASCII_RESTRICTED) {
14508             /* Under /a and /aa, nothing above ASCII matches these */
14509             _invlist_intersection(posixes,
14510                                   PL_XPosix_ptrs[_CC_ASCII],
14511                                   &posixes);
14512         }
14513         if (nposixes) {
14514             if (DEPENDS_SEMANTICS) {
14515                 /* Under /d, everything in the upper half of the Latin1 range
14516                  * matches these complements */
14517                 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_NON_ASCII_ALL;
14518             }
14519             else if (AT_LEAST_ASCII_RESTRICTED) {
14520                 /* Under /a and /aa, everything above ASCII matches these
14521                  * complements */
14522                 _invlist_union_complement_2nd(nposixes,
14523                                               PL_XPosix_ptrs[_CC_ASCII],
14524                                               &nposixes);
14525             }
14526             if (posixes) {
14527                 _invlist_union(posixes, nposixes, &posixes);
14528                 SvREFCNT_dec_NN(nposixes);
14529             }
14530             else {
14531                 posixes = nposixes;
14532             }
14533         }
14534         if (! DEPENDS_SEMANTICS) {
14535             if (cp_list) {
14536                 _invlist_union(cp_list, posixes, &cp_list);
14537                 SvREFCNT_dec_NN(posixes);
14538             }
14539             else {
14540                 cp_list = posixes;
14541             }
14542         }
14543         else {
14544             /* Under /d, we put into a separate list the Latin1 things that
14545              * match only when the target string is utf8 */
14546             SV* nonascii_but_latin1_properties = NULL;
14547             _invlist_intersection(posixes, PL_UpperLatin1,
14548                                   &nonascii_but_latin1_properties);
14549             _invlist_subtract(posixes, nonascii_but_latin1_properties,
14550                               &posixes);
14551             if (cp_list) {
14552                 _invlist_union(cp_list, posixes, &cp_list);
14553                 SvREFCNT_dec_NN(posixes);
14554             }
14555             else {
14556                 cp_list = posixes;
14557             }
14558
14559             if (depends_list) {
14560                 _invlist_union(depends_list, nonascii_but_latin1_properties,
14561                                &depends_list);
14562                 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
14563             }
14564             else {
14565                 depends_list = nonascii_but_latin1_properties;
14566             }
14567         }
14568     }
14569
14570     /* And combine the result (if any) with any inversion list from properties.
14571      * The lists are kept separate up to now so that we can distinguish the two
14572      * in regards to matching above-Unicode.  A run-time warning is generated
14573      * if a Unicode property is matched against a non-Unicode code point. But,
14574      * we allow user-defined properties to match anything, without any warning,
14575      * and we also suppress the warning if there is a portion of the character
14576      * class that isn't a Unicode property, and which matches above Unicode, \W
14577      * or [\x{110000}] for example.
14578      * (Note that in this case, unlike the Posix one above, there is no
14579      * <depends_list>, because having a Unicode property forces Unicode
14580      * semantics */
14581     if (properties) {
14582         if (cp_list) {
14583
14584             /* If it matters to the final outcome, see if a non-property
14585              * component of the class matches above Unicode.  If so, the
14586              * warning gets suppressed.  This is true even if just a single
14587              * such code point is specified, as though not strictly correct if
14588              * another such code point is matched against, the fact that they
14589              * are using above-Unicode code points indicates they should know
14590              * the issues involved */
14591             if (warn_super) {
14592                 warn_super = ! (invert
14593                                ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
14594             }
14595
14596             _invlist_union(properties, cp_list, &cp_list);
14597             SvREFCNT_dec_NN(properties);
14598         }
14599         else {
14600             cp_list = properties;
14601         }
14602
14603         if (warn_super) {
14604             ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
14605         }
14606     }
14607
14608     /* Here, we have calculated what code points should be in the character
14609      * class.
14610      *
14611      * Now we can see about various optimizations.  Fold calculation (which we
14612      * did above) needs to take place before inversion.  Otherwise /[^k]/i
14613      * would invert to include K, which under /i would match k, which it
14614      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
14615      * folded until runtime */
14616
14617     /* If we didn't do folding, it's because some information isn't available
14618      * until runtime; set the run-time fold flag for these.  (We don't have to
14619      * worry about properties folding, as that is taken care of by the swash
14620      * fetching).  We know to set the flag if we have a non-NULL list for UTF-8
14621      * locales, or the class matches at least one 0-255 range code point */
14622     if (LOC && FOLD) {
14623         if (only_utf8_locale_list) {
14624             ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14625         }
14626         else if (cp_list) { /* Look to see if there a 0-255 code point is in
14627                                the list */
14628             UV start, end;
14629             invlist_iterinit(cp_list);
14630             if (invlist_iternext(cp_list, &start, &end) && start < 256) {
14631                 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14632             }
14633             invlist_iterfinish(cp_list);
14634         }
14635     }
14636
14637     /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
14638      * at compile time.  Besides not inverting folded locale now, we can't
14639      * invert if there are things such as \w, which aren't known until runtime
14640      * */
14641     if (cp_list
14642         && invert
14643         && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
14644         && ! depends_list
14645         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14646     {
14647         _invlist_invert(cp_list);
14648
14649         /* Any swash can't be used as-is, because we've inverted things */
14650         if (swash) {
14651             SvREFCNT_dec_NN(swash);
14652             swash = NULL;
14653         }
14654
14655         /* Clear the invert flag since have just done it here */
14656         invert = FALSE;
14657     }
14658
14659     if (ret_invlist) {
14660         *ret_invlist = cp_list;
14661         SvREFCNT_dec(swash);
14662
14663         /* Discard the generated node */
14664         if (SIZE_ONLY) {
14665             RExC_size = orig_size;
14666         }
14667         else {
14668             RExC_emit = orig_emit;
14669         }
14670         return orig_emit;
14671     }
14672
14673     /* Some character classes are equivalent to other nodes.  Such nodes take
14674      * up less room and generally fewer operations to execute than ANYOF nodes.
14675      * Above, we checked for and optimized into some such equivalents for
14676      * certain common classes that are easy to test.  Getting to this point in
14677      * the code means that the class didn't get optimized there.  Since this
14678      * code is only executed in Pass 2, it is too late to save space--it has
14679      * been allocated in Pass 1, and currently isn't given back.  But turning
14680      * things into an EXACTish node can allow the optimizer to join it to any
14681      * adjacent such nodes.  And if the class is equivalent to things like /./,
14682      * expensive run-time swashes can be avoided.  Now that we have more
14683      * complete information, we can find things necessarily missed by the
14684      * earlier code.  I (khw) am not sure how much to look for here.  It would
14685      * be easy, but perhaps too slow, to check any candidates against all the
14686      * node types they could possibly match using _invlistEQ(). */
14687
14688     if (cp_list
14689         && ! invert
14690         && ! depends_list
14691         && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
14692         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14693
14694            /* We don't optimize if we are supposed to make sure all non-Unicode
14695             * code points raise a warning, as only ANYOF nodes have this check.
14696             * */
14697         && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
14698     {
14699         UV start, end;
14700         U8 op = END;  /* The optimzation node-type */
14701         const char * cur_parse= RExC_parse;
14702
14703         invlist_iterinit(cp_list);
14704         if (! invlist_iternext(cp_list, &start, &end)) {
14705
14706             /* Here, the list is empty.  This happens, for example, when a
14707              * Unicode property is the only thing in the character class, and
14708              * it doesn't match anything.  (perluniprops.pod notes such
14709              * properties) */
14710             op = OPFAIL;
14711             *flagp |= HASWIDTH|SIMPLE;
14712         }
14713         else if (start == end) {    /* The range is a single code point */
14714             if (! invlist_iternext(cp_list, &start, &end)
14715
14716                     /* Don't do this optimization if it would require changing
14717                      * the pattern to UTF-8 */
14718                 && (start < 256 || UTF))
14719             {
14720                 /* Here, the list contains a single code point.  Can optimize
14721                  * into an EXACTish node */
14722
14723                 value = start;
14724
14725                 if (! FOLD) {
14726                     op = EXACT;
14727                 }
14728                 else if (LOC) {
14729
14730                     /* A locale node under folding with one code point can be
14731                      * an EXACTFL, as its fold won't be calculated until
14732                      * runtime */
14733                     op = EXACTFL;
14734                 }
14735                 else {
14736
14737                     /* Here, we are generally folding, but there is only one
14738                      * code point to match.  If we have to, we use an EXACT
14739                      * node, but it would be better for joining with adjacent
14740                      * nodes in the optimization pass if we used the same
14741                      * EXACTFish node that any such are likely to be.  We can
14742                      * do this iff the code point doesn't participate in any
14743                      * folds.  For example, an EXACTF of a colon is the same as
14744                      * an EXACT one, since nothing folds to or from a colon. */
14745                     if (value < 256) {
14746                         if (IS_IN_SOME_FOLD_L1(value)) {
14747                             op = EXACT;
14748                         }
14749                     }
14750                     else {
14751                         if (_invlist_contains_cp(PL_utf8_foldable, value)) {
14752                             op = EXACT;
14753                         }
14754                     }
14755
14756                     /* If we haven't found the node type, above, it means we
14757                      * can use the prevailing one */
14758                     if (op == END) {
14759                         op = compute_EXACTish(pRExC_state);
14760                     }
14761                 }
14762             }
14763         }
14764         else if (start == 0) {
14765             if (end == UV_MAX) {
14766                 op = SANY;
14767                 *flagp |= HASWIDTH|SIMPLE;
14768                 RExC_naughty++;
14769             }
14770             else if (end == '\n' - 1
14771                     && invlist_iternext(cp_list, &start, &end)
14772                     && start == '\n' + 1 && end == UV_MAX)
14773             {
14774                 op = REG_ANY;
14775                 *flagp |= HASWIDTH|SIMPLE;
14776                 RExC_naughty++;
14777             }
14778         }
14779         invlist_iterfinish(cp_list);
14780
14781         if (op != END) {
14782             RExC_parse = (char *)orig_parse;
14783             RExC_emit = (regnode *)orig_emit;
14784
14785             ret = reg_node(pRExC_state, op);
14786
14787             RExC_parse = (char *)cur_parse;
14788
14789             if (PL_regkind[op] == EXACT) {
14790                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14791                                            TRUE /* downgradable to EXACT */
14792                                           );
14793             }
14794
14795             SvREFCNT_dec_NN(cp_list);
14796             return ret;
14797         }
14798     }
14799
14800     /* Here, <cp_list> contains all the code points we can determine at
14801      * compile time that match under all conditions.  Go through it, and
14802      * for things that belong in the bitmap, put them there, and delete from
14803      * <cp_list>.  While we are at it, see if everything above 255 is in the
14804      * list, and if so, set a flag to speed up execution */
14805
14806     populate_ANYOF_from_invlist(ret, &cp_list);
14807
14808     if (invert) {
14809         ANYOF_FLAGS(ret) |= ANYOF_INVERT;
14810     }
14811
14812     /* Here, the bitmap has been populated with all the Latin1 code points that
14813      * always match.  Can now add to the overall list those that match only
14814      * when the target string is UTF-8 (<depends_list>). */
14815     if (depends_list) {
14816         if (cp_list) {
14817             _invlist_union(cp_list, depends_list, &cp_list);
14818             SvREFCNT_dec_NN(depends_list);
14819         }
14820         else {
14821             cp_list = depends_list;
14822         }
14823         ANYOF_FLAGS(ret) |= ANYOF_UTF8;
14824     }
14825
14826     /* If there is a swash and more than one element, we can't use the swash in
14827      * the optimization below. */
14828     if (swash && element_count > 1) {
14829         SvREFCNT_dec_NN(swash);
14830         swash = NULL;
14831     }
14832
14833     set_ANYOF_arg(pRExC_state, ret, cp_list,
14834                   (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14835                    ? listsv : NULL,
14836                   only_utf8_locale_list,
14837                   swash, has_user_defined_property);
14838
14839     *flagp |= HASWIDTH|SIMPLE;
14840
14841     if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
14842         RExC_contains_locale = 1;
14843     }
14844
14845     return ret;
14846 }
14847
14848 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14849
14850 STATIC void
14851 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
14852                 regnode* const node,
14853                 SV* const cp_list,
14854                 SV* const runtime_defns,
14855                 SV* const only_utf8_locale_list,
14856                 SV* const swash,
14857                 const bool has_user_defined_property)
14858 {
14859     /* Sets the arg field of an ANYOF-type node 'node', using information about
14860      * the node passed-in.  If there is nothing outside the node's bitmap, the
14861      * arg is set to ANYOF_NONBITMAP_EMPTY.  Otherwise, it sets the argument to
14862      * the count returned by add_data(), having allocated and stored an array,
14863      * av, that that count references, as follows:
14864      *  av[0] stores the character class description in its textual form.
14865      *        This is used later (regexec.c:Perl_regclass_swash()) to
14866      *        initialize the appropriate swash, and is also useful for dumping
14867      *        the regnode.  This is set to &PL_sv_undef if the textual
14868      *        description is not needed at run-time (as happens if the other
14869      *        elements completely define the class)
14870      *  av[1] if &PL_sv_undef, is a placeholder to later contain the swash
14871      *        computed from av[0].  But if no further computation need be done,
14872      *        the swash is stored here now (and av[0] is &PL_sv_undef).
14873      *  av[2] stores the inversion list of code points that match only if the
14874      *        current locale is UTF-8
14875      *  av[3] stores the cp_list inversion list for use in addition or instead
14876      *        of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
14877      *        (Otherwise everything needed is already in av[0] and av[1])
14878      *  av[4] is set if any component of the class is from a user-defined
14879      *        property; used only if av[3] exists */
14880
14881     UV n;
14882
14883     PERL_ARGS_ASSERT_SET_ANYOF_ARG;
14884
14885     if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
14886         assert(! (ANYOF_FLAGS(node)
14887                     & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8)));
14888         ARG_SET(node, ANYOF_NONBITMAP_EMPTY);
14889     }
14890     else {
14891         AV * const av = newAV();
14892         SV *rv;
14893
14894         assert(ANYOF_FLAGS(node)
14895                     & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD));
14896
14897         av_store(av, 0, (runtime_defns)
14898                         ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
14899         if (swash) {
14900             assert(cp_list);
14901             av_store(av, 1, swash);
14902             SvREFCNT_dec_NN(cp_list);
14903         }
14904         else {
14905             av_store(av, 1, &PL_sv_undef);
14906             if (cp_list) {
14907                 av_store(av, 3, cp_list);
14908                 av_store(av, 4, newSVuv(has_user_defined_property));
14909             }
14910         }
14911
14912         if (only_utf8_locale_list) {
14913             av_store(av, 2, only_utf8_locale_list);
14914         }
14915         else {
14916             av_store(av, 2, &PL_sv_undef);
14917         }
14918
14919         rv = newRV_noinc(MUTABLE_SV(av));
14920         n = add_data(pRExC_state, STR_WITH_LEN("s"));
14921         RExC_rxi->data->data[n] = (void*)rv;
14922         ARG_SET(node, n);
14923     }
14924 }
14925
14926
14927 /* reg_skipcomment()
14928
14929    Absorbs an /x style # comment from the input stream,
14930    returning a pointer to the first character beyond the comment, or if the
14931    comment terminates the pattern without anything following it, this returns
14932    one past the final character of the pattern (in other words, RExC_end) and
14933    sets the REG_RUN_ON_COMMENT_SEEN flag.
14934
14935    Note it's the callers responsibility to ensure that we are
14936    actually in /x mode
14937
14938 */
14939
14940 PERL_STATIC_INLINE char*
14941 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state, char* p)
14942 {
14943     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
14944
14945     assert(*p = '#');
14946
14947     while (p < RExC_end) {
14948         if (*(++p) == '\n') {
14949             return p+1;
14950         }
14951     }
14952
14953     /* we ran off the end of the pattern without ending the comment, so we have
14954      * to add an \n when wrapping */
14955     RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
14956     return p;
14957 }
14958
14959 /* nextchar()
14960
14961    Advances the parse position, and optionally absorbs
14962    "whitespace" from the inputstream.
14963
14964    Without /x "whitespace" means (?#...) style comments only,
14965    with /x this means (?#...) and # comments and whitespace proper.
14966
14967    Returns the RExC_parse point from BEFORE the scan occurs.
14968
14969    This is the /x friendly way of saying RExC_parse++.
14970 */
14971
14972 STATIC char*
14973 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
14974 {
14975     char* const retval = RExC_parse++;
14976
14977     PERL_ARGS_ASSERT_NEXTCHAR;
14978
14979     for (;;) {
14980         if (RExC_end - RExC_parse >= 3
14981             && *RExC_parse == '('
14982             && RExC_parse[1] == '?'
14983             && RExC_parse[2] == '#')
14984         {
14985             while (*RExC_parse != ')') {
14986                 if (RExC_parse == RExC_end)
14987                     FAIL("Sequence (?#... not terminated");
14988                 RExC_parse++;
14989             }
14990             RExC_parse++;
14991             continue;
14992         }
14993         if (RExC_flags & RXf_PMf_EXTENDED) {
14994             char * p = regpatws(pRExC_state, RExC_parse,
14995                                           TRUE); /* means recognize comments */
14996             if (p != RExC_parse) {
14997                 RExC_parse = p;
14998                 continue;
14999             }
15000         }
15001         return retval;
15002     }
15003 }
15004
15005 /*
15006 - reg_node - emit a node
15007 */
15008 STATIC regnode *                        /* Location. */
15009 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
15010 {
15011     dVAR;
15012     regnode *ptr;
15013     regnode * const ret = RExC_emit;
15014     GET_RE_DEBUG_FLAGS_DECL;
15015
15016     PERL_ARGS_ASSERT_REG_NODE;
15017
15018     if (SIZE_ONLY) {
15019         SIZE_ALIGN(RExC_size);
15020         RExC_size += 1;
15021         return(ret);
15022     }
15023     if (RExC_emit >= RExC_emit_bound)
15024         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15025                    op, RExC_emit, RExC_emit_bound);
15026
15027     NODE_ALIGN_FILL(ret);
15028     ptr = ret;
15029     FILL_ADVANCE_NODE(ptr, op);
15030 #ifdef RE_TRACK_PATTERN_OFFSETS
15031     if (RExC_offsets) {         /* MJD */
15032         MJD_OFFSET_DEBUG(
15033               ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
15034               "reg_node", __LINE__,
15035               PL_reg_name[op],
15036               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
15037                 ? "Overwriting end of array!\n" : "OK",
15038               (UV)(RExC_emit - RExC_emit_start),
15039               (UV)(RExC_parse - RExC_start),
15040               (UV)RExC_offsets[0]));
15041         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
15042     }
15043 #endif
15044     RExC_emit = ptr;
15045     return(ret);
15046 }
15047
15048 /*
15049 - reganode - emit a node with an argument
15050 */
15051 STATIC regnode *                        /* Location. */
15052 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
15053 {
15054     dVAR;
15055     regnode *ptr;
15056     regnode * const ret = RExC_emit;
15057     GET_RE_DEBUG_FLAGS_DECL;
15058
15059     PERL_ARGS_ASSERT_REGANODE;
15060
15061     if (SIZE_ONLY) {
15062         SIZE_ALIGN(RExC_size);
15063         RExC_size += 2;
15064         /*
15065            We can't do this:
15066
15067            assert(2==regarglen[op]+1);
15068
15069            Anything larger than this has to allocate the extra amount.
15070            If we changed this to be:
15071
15072            RExC_size += (1 + regarglen[op]);
15073
15074            then it wouldn't matter. Its not clear what side effect
15075            might come from that so its not done so far.
15076            -- dmq
15077         */
15078         return(ret);
15079     }
15080     if (RExC_emit >= RExC_emit_bound)
15081         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15082                    op, RExC_emit, RExC_emit_bound);
15083
15084     NODE_ALIGN_FILL(ret);
15085     ptr = ret;
15086     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
15087 #ifdef RE_TRACK_PATTERN_OFFSETS
15088     if (RExC_offsets) {         /* MJD */
15089         MJD_OFFSET_DEBUG(
15090               ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15091               "reganode",
15092               __LINE__,
15093               PL_reg_name[op],
15094               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
15095               "Overwriting end of array!\n" : "OK",
15096               (UV)(RExC_emit - RExC_emit_start),
15097               (UV)(RExC_parse - RExC_start),
15098               (UV)RExC_offsets[0]));
15099         Set_Cur_Node_Offset;
15100     }
15101 #endif
15102     RExC_emit = ptr;
15103     return(ret);
15104 }
15105
15106 /*
15107 - reguni - emit (if appropriate) a Unicode character
15108 */
15109 PERL_STATIC_INLINE STRLEN
15110 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
15111 {
15112     dVAR;
15113
15114     PERL_ARGS_ASSERT_REGUNI;
15115
15116     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
15117 }
15118
15119 /*
15120 - reginsert - insert an operator in front of already-emitted operand
15121 *
15122 * Means relocating the operand.
15123 */
15124 STATIC void
15125 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
15126 {
15127     dVAR;
15128     regnode *src;
15129     regnode *dst;
15130     regnode *place;
15131     const int offset = regarglen[(U8)op];
15132     const int size = NODE_STEP_REGNODE + offset;
15133     GET_RE_DEBUG_FLAGS_DECL;
15134
15135     PERL_ARGS_ASSERT_REGINSERT;
15136     PERL_UNUSED_ARG(depth);
15137 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
15138     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
15139     if (SIZE_ONLY) {
15140         RExC_size += size;
15141         return;
15142     }
15143
15144     src = RExC_emit;
15145     RExC_emit += size;
15146     dst = RExC_emit;
15147     if (RExC_open_parens) {
15148         int paren;
15149         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
15150         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
15151             if ( RExC_open_parens[paren] >= opnd ) {
15152                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
15153                 RExC_open_parens[paren] += size;
15154             } else {
15155                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
15156             }
15157             if ( RExC_close_parens[paren] >= opnd ) {
15158                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
15159                 RExC_close_parens[paren] += size;
15160             } else {
15161                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
15162             }
15163         }
15164     }
15165
15166     while (src > opnd) {
15167         StructCopy(--src, --dst, regnode);
15168 #ifdef RE_TRACK_PATTERN_OFFSETS
15169         if (RExC_offsets) {     /* MJD 20010112 */
15170             MJD_OFFSET_DEBUG(
15171                  ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
15172                   "reg_insert",
15173                   __LINE__,
15174                   PL_reg_name[op],
15175                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
15176                     ? "Overwriting end of array!\n" : "OK",
15177                   (UV)(src - RExC_emit_start),
15178                   (UV)(dst - RExC_emit_start),
15179                   (UV)RExC_offsets[0]));
15180             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
15181             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
15182         }
15183 #endif
15184     }
15185
15186
15187     place = opnd;               /* Op node, where operand used to be. */
15188 #ifdef RE_TRACK_PATTERN_OFFSETS
15189     if (RExC_offsets) {         /* MJD */
15190         MJD_OFFSET_DEBUG(
15191               ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15192               "reginsert",
15193               __LINE__,
15194               PL_reg_name[op],
15195               (UV)(place - RExC_emit_start) > RExC_offsets[0]
15196               ? "Overwriting end of array!\n" : "OK",
15197               (UV)(place - RExC_emit_start),
15198               (UV)(RExC_parse - RExC_start),
15199               (UV)RExC_offsets[0]));
15200         Set_Node_Offset(place, RExC_parse);
15201         Set_Node_Length(place, 1);
15202     }
15203 #endif
15204     src = NEXTOPER(place);
15205     FILL_ADVANCE_NODE(place, op);
15206     Zero(src, offset, regnode);
15207 }
15208
15209 /*
15210 - regtail - set the next-pointer at the end of a node chain of p to val.
15211 - SEE ALSO: regtail_study
15212 */
15213 /* TODO: All three parms should be const */
15214 STATIC void
15215 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15216                 const regnode *val,U32 depth)
15217 {
15218     dVAR;
15219     regnode *scan;
15220     GET_RE_DEBUG_FLAGS_DECL;
15221
15222     PERL_ARGS_ASSERT_REGTAIL;
15223 #ifndef DEBUGGING
15224     PERL_UNUSED_ARG(depth);
15225 #endif
15226
15227     if (SIZE_ONLY)
15228         return;
15229
15230     /* Find last node. */
15231     scan = p;
15232     for (;;) {
15233         regnode * const temp = regnext(scan);
15234         DEBUG_PARSE_r({
15235             SV * const mysv=sv_newmortal();
15236             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
15237             regprop(RExC_rx, mysv, scan, NULL);
15238             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
15239                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
15240                     (temp == NULL ? "->" : ""),
15241                     (temp == NULL ? PL_reg_name[OP(val)] : "")
15242             );
15243         });
15244         if (temp == NULL)
15245             break;
15246         scan = temp;
15247     }
15248
15249     if (reg_off_by_arg[OP(scan)]) {
15250         ARG_SET(scan, val - scan);
15251     }
15252     else {
15253         NEXT_OFF(scan) = val - scan;
15254     }
15255 }
15256
15257 #ifdef DEBUGGING
15258 /*
15259 - regtail_study - set the next-pointer at the end of a node chain of p to val.
15260 - Look for optimizable sequences at the same time.
15261 - currently only looks for EXACT chains.
15262
15263 This is experimental code. The idea is to use this routine to perform
15264 in place optimizations on branches and groups as they are constructed,
15265 with the long term intention of removing optimization from study_chunk so
15266 that it is purely analytical.
15267
15268 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
15269 to control which is which.
15270
15271 */
15272 /* TODO: All four parms should be const */
15273
15274 STATIC U8
15275 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15276                       const regnode *val,U32 depth)
15277 {
15278     dVAR;
15279     regnode *scan;
15280     U8 exact = PSEUDO;
15281 #ifdef EXPERIMENTAL_INPLACESCAN
15282     I32 min = 0;
15283 #endif
15284     GET_RE_DEBUG_FLAGS_DECL;
15285
15286     PERL_ARGS_ASSERT_REGTAIL_STUDY;
15287
15288
15289     if (SIZE_ONLY)
15290         return exact;
15291
15292     /* Find last node. */
15293
15294     scan = p;
15295     for (;;) {
15296         regnode * const temp = regnext(scan);
15297 #ifdef EXPERIMENTAL_INPLACESCAN
15298         if (PL_regkind[OP(scan)] == EXACT) {
15299             bool unfolded_multi_char;   /* Unexamined in this routine */
15300             if (join_exact(pRExC_state, scan, &min,
15301                            &unfolded_multi_char, 1, val, depth+1))
15302                 return EXACT;
15303         }
15304 #endif
15305         if ( exact ) {
15306             switch (OP(scan)) {
15307                 case EXACT:
15308                 case EXACTF:
15309                 case EXACTFA_NO_TRIE:
15310                 case EXACTFA:
15311                 case EXACTFU:
15312                 case EXACTFU_SS:
15313                 case EXACTFL:
15314                         if( exact == PSEUDO )
15315                             exact= OP(scan);
15316                         else if ( exact != OP(scan) )
15317                             exact= 0;
15318                 case NOTHING:
15319                     break;
15320                 default:
15321                     exact= 0;
15322             }
15323         }
15324         DEBUG_PARSE_r({
15325             SV * const mysv=sv_newmortal();
15326             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
15327             regprop(RExC_rx, mysv, scan, NULL);
15328             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
15329                 SvPV_nolen_const(mysv),
15330                 REG_NODE_NUM(scan),
15331                 PL_reg_name[exact]);
15332         });
15333         if (temp == NULL)
15334             break;
15335         scan = temp;
15336     }
15337     DEBUG_PARSE_r({
15338         SV * const mysv_val=sv_newmortal();
15339         DEBUG_PARSE_MSG("");
15340         regprop(RExC_rx, mysv_val, val, NULL);
15341         PerlIO_printf(Perl_debug_log,
15342                       "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
15343                       SvPV_nolen_const(mysv_val),
15344                       (IV)REG_NODE_NUM(val),
15345                       (IV)(val - scan)
15346         );
15347     });
15348     if (reg_off_by_arg[OP(scan)]) {
15349         ARG_SET(scan, val - scan);
15350     }
15351     else {
15352         NEXT_OFF(scan) = val - scan;
15353     }
15354
15355     return exact;
15356 }
15357 #endif
15358
15359 /*
15360  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
15361  */
15362 #ifdef DEBUGGING
15363
15364 static void
15365 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
15366 {
15367     int bit;
15368     int set=0;
15369
15370     ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15371
15372     for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
15373         if (flags & (1<<bit)) {
15374             if (!set++ && lead)
15375                 PerlIO_printf(Perl_debug_log, "%s",lead);
15376             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
15377         }
15378     }
15379     if (lead)  {
15380         if (set)
15381             PerlIO_printf(Perl_debug_log, "\n");
15382         else
15383             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15384     }
15385 }
15386
15387 static void
15388 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
15389 {
15390     int bit;
15391     int set=0;
15392     regex_charset cs;
15393
15394     ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15395
15396     for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
15397         if (flags & (1<<bit)) {
15398             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
15399                 continue;
15400             }
15401             if (!set++ && lead)
15402                 PerlIO_printf(Perl_debug_log, "%s",lead);
15403             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
15404         }
15405     }
15406     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
15407             if (!set++ && lead) {
15408                 PerlIO_printf(Perl_debug_log, "%s",lead);
15409             }
15410             switch (cs) {
15411                 case REGEX_UNICODE_CHARSET:
15412                     PerlIO_printf(Perl_debug_log, "UNICODE");
15413                     break;
15414                 case REGEX_LOCALE_CHARSET:
15415                     PerlIO_printf(Perl_debug_log, "LOCALE");
15416                     break;
15417                 case REGEX_ASCII_RESTRICTED_CHARSET:
15418                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
15419                     break;
15420                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
15421                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
15422                     break;
15423                 default:
15424                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
15425                     break;
15426             }
15427     }
15428     if (lead)  {
15429         if (set)
15430             PerlIO_printf(Perl_debug_log, "\n");
15431         else
15432             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15433     }
15434 }
15435 #endif
15436
15437 void
15438 Perl_regdump(pTHX_ const regexp *r)
15439 {
15440 #ifdef DEBUGGING
15441     dVAR;
15442     SV * const sv = sv_newmortal();
15443     SV *dsv= sv_newmortal();
15444     RXi_GET_DECL(r,ri);
15445     GET_RE_DEBUG_FLAGS_DECL;
15446
15447     PERL_ARGS_ASSERT_REGDUMP;
15448
15449     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
15450
15451     /* Header fields of interest. */
15452     if (r->anchored_substr) {
15453         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
15454             RE_SV_DUMPLEN(r->anchored_substr), 30);
15455         PerlIO_printf(Perl_debug_log,
15456                       "anchored %s%s at %"IVdf" ",
15457                       s, RE_SV_TAIL(r->anchored_substr),
15458                       (IV)r->anchored_offset);
15459     } else if (r->anchored_utf8) {
15460         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
15461             RE_SV_DUMPLEN(r->anchored_utf8), 30);
15462         PerlIO_printf(Perl_debug_log,
15463                       "anchored utf8 %s%s at %"IVdf" ",
15464                       s, RE_SV_TAIL(r->anchored_utf8),
15465                       (IV)r->anchored_offset);
15466     }
15467     if (r->float_substr) {
15468         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
15469             RE_SV_DUMPLEN(r->float_substr), 30);
15470         PerlIO_printf(Perl_debug_log,
15471                       "floating %s%s at %"IVdf"..%"UVuf" ",
15472                       s, RE_SV_TAIL(r->float_substr),
15473                       (IV)r->float_min_offset, (UV)r->float_max_offset);
15474     } else if (r->float_utf8) {
15475         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
15476             RE_SV_DUMPLEN(r->float_utf8), 30);
15477         PerlIO_printf(Perl_debug_log,
15478                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
15479                       s, RE_SV_TAIL(r->float_utf8),
15480                       (IV)r->float_min_offset, (UV)r->float_max_offset);
15481     }
15482     if (r->check_substr || r->check_utf8)
15483         PerlIO_printf(Perl_debug_log,
15484                       (const char *)
15485                       (r->check_substr == r->float_substr
15486                        && r->check_utf8 == r->float_utf8
15487                        ? "(checking floating" : "(checking anchored"));
15488     if (r->intflags & PREGf_NOSCAN)
15489         PerlIO_printf(Perl_debug_log, " noscan");
15490     if (r->extflags & RXf_CHECK_ALL)
15491         PerlIO_printf(Perl_debug_log, " isall");
15492     if (r->check_substr || r->check_utf8)
15493         PerlIO_printf(Perl_debug_log, ") ");
15494
15495     if (ri->regstclass) {
15496         regprop(r, sv, ri->regstclass, NULL);
15497         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
15498     }
15499     if (r->intflags & PREGf_ANCH) {
15500         PerlIO_printf(Perl_debug_log, "anchored");
15501         if (r->intflags & PREGf_ANCH_BOL)
15502             PerlIO_printf(Perl_debug_log, "(BOL)");
15503         if (r->intflags & PREGf_ANCH_MBOL)
15504             PerlIO_printf(Perl_debug_log, "(MBOL)");
15505         if (r->intflags & PREGf_ANCH_SBOL)
15506             PerlIO_printf(Perl_debug_log, "(SBOL)");
15507         if (r->intflags & PREGf_ANCH_GPOS)
15508             PerlIO_printf(Perl_debug_log, "(GPOS)");
15509         PerlIO_putc(Perl_debug_log, ' ');
15510     }
15511     if (r->intflags & PREGf_GPOS_SEEN)
15512         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
15513     if (r->intflags & PREGf_SKIP)
15514         PerlIO_printf(Perl_debug_log, "plus ");
15515     if (r->intflags & PREGf_IMPLICIT)
15516         PerlIO_printf(Perl_debug_log, "implicit ");
15517     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
15518     if (r->extflags & RXf_EVAL_SEEN)
15519         PerlIO_printf(Perl_debug_log, "with eval ");
15520     PerlIO_printf(Perl_debug_log, "\n");
15521     DEBUG_FLAGS_r({
15522         regdump_extflags("r->extflags: ",r->extflags);
15523         regdump_intflags("r->intflags: ",r->intflags);
15524     });
15525 #else
15526     PERL_ARGS_ASSERT_REGDUMP;
15527     PERL_UNUSED_CONTEXT;
15528     PERL_UNUSED_ARG(r);
15529 #endif  /* DEBUGGING */
15530 }
15531
15532 /*
15533 - regprop - printable representation of opcode, with run time support
15534 */
15535
15536 void
15537 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo)
15538 {
15539 #ifdef DEBUGGING
15540     dVAR;
15541     int k;
15542
15543     /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
15544     static const char * const anyofs[] = {
15545 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
15546     || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
15547     || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
15548     || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
15549     || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
15550     || _CC_VERTSPACE != 16
15551   #error Need to adjust order of anyofs[]
15552 #endif
15553         "\\w",
15554         "\\W",
15555         "\\d",
15556         "\\D",
15557         "[:alpha:]",
15558         "[:^alpha:]",
15559         "[:lower:]",
15560         "[:^lower:]",
15561         "[:upper:]",
15562         "[:^upper:]",
15563         "[:punct:]",
15564         "[:^punct:]",
15565         "[:print:]",
15566         "[:^print:]",
15567         "[:alnum:]",
15568         "[:^alnum:]",
15569         "[:graph:]",
15570         "[:^graph:]",
15571         "[:cased:]",
15572         "[:^cased:]",
15573         "\\s",
15574         "\\S",
15575         "[:blank:]",
15576         "[:^blank:]",
15577         "[:xdigit:]",
15578         "[:^xdigit:]",
15579         "[:space:]",
15580         "[:^space:]",
15581         "[:cntrl:]",
15582         "[:^cntrl:]",
15583         "[:ascii:]",
15584         "[:^ascii:]",
15585         "\\v",
15586         "\\V"
15587     };
15588     RXi_GET_DECL(prog,progi);
15589     GET_RE_DEBUG_FLAGS_DECL;
15590
15591     PERL_ARGS_ASSERT_REGPROP;
15592
15593     sv_setpvs(sv, "");
15594
15595     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
15596         /* It would be nice to FAIL() here, but this may be called from
15597            regexec.c, and it would be hard to supply pRExC_state. */
15598         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
15599                                               (int)OP(o), (int)REGNODE_MAX);
15600     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
15601
15602     k = PL_regkind[OP(o)];
15603
15604     if (k == EXACT) {
15605         sv_catpvs(sv, " ");
15606         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
15607          * is a crude hack but it may be the best for now since
15608          * we have no flag "this EXACTish node was UTF-8"
15609          * --jhi */
15610         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
15611                   PERL_PV_ESCAPE_UNI_DETECT |
15612                   PERL_PV_ESCAPE_NONASCII   |
15613                   PERL_PV_PRETTY_ELLIPSES   |
15614                   PERL_PV_PRETTY_LTGT       |
15615                   PERL_PV_PRETTY_NOCLEAR
15616                   );
15617     } else if (k == TRIE) {
15618         /* print the details of the trie in dumpuntil instead, as
15619          * progi->data isn't available here */
15620         const char op = OP(o);
15621         const U32 n = ARG(o);
15622         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
15623                (reg_ac_data *)progi->data->data[n] :
15624                NULL;
15625         const reg_trie_data * const trie
15626             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
15627
15628         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
15629         DEBUG_TRIE_COMPILE_r(
15630           Perl_sv_catpvf(aTHX_ sv,
15631             "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
15632             (UV)trie->startstate,
15633             (IV)trie->statecount-1, /* -1 because of the unused 0 element */
15634             (UV)trie->wordcount,
15635             (UV)trie->minlen,
15636             (UV)trie->maxlen,
15637             (UV)TRIE_CHARCOUNT(trie),
15638             (UV)trie->uniquecharcount
15639           );
15640         );
15641         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
15642             sv_catpvs(sv, "[");
15643             (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
15644                                                    ? ANYOF_BITMAP(o)
15645                                                    : TRIE_BITMAP(trie));
15646             sv_catpvs(sv, "]");
15647         }
15648
15649     } else if (k == CURLY) {
15650         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
15651             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
15652         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
15653     }
15654     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
15655         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
15656     else if (k == REF || k == OPEN || k == CLOSE
15657              || k == GROUPP || OP(o)==ACCEPT)
15658     {
15659         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
15660         if ( RXp_PAREN_NAMES(prog) ) {
15661             if ( k != REF || (OP(o) < NREF)) {
15662                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
15663                 SV **name= av_fetch(list, ARG(o), 0 );
15664                 if (name)
15665                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15666             }
15667             else {
15668                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
15669                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
15670                 I32 *nums=(I32*)SvPVX(sv_dat);
15671                 SV **name= av_fetch(list, nums[0], 0 );
15672                 I32 n;
15673                 if (name) {
15674                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
15675                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
15676                                     (n ? "," : ""), (IV)nums[n]);
15677                     }
15678                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15679                 }
15680             }
15681         }
15682         if ( k == REF && reginfo) {
15683             U32 n = ARG(o);  /* which paren pair */
15684             I32 ln = prog->offs[n].start;
15685             if (prog->lastparen < n || ln == -1)
15686                 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
15687             else if (ln == prog->offs[n].end)
15688                 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
15689             else {
15690                 const char *s = reginfo->strbeg + ln;
15691                 Perl_sv_catpvf(aTHX_ sv, ": ");
15692                 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
15693                     PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
15694             }
15695         }
15696     } else if (k == GOSUB)
15697         /* Paren and offset */
15698         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
15699     else if (k == VERB) {
15700         if (!o->flags)
15701             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
15702                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
15703     } else if (k == LOGICAL)
15704         /* 2: embedded, otherwise 1 */
15705         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
15706     else if (k == ANYOF) {
15707         const U8 flags = ANYOF_FLAGS(o);
15708         int do_sep = 0;
15709
15710
15711         if (flags & ANYOF_LOCALE_FLAGS)
15712             sv_catpvs(sv, "{loc}");
15713         if (flags & ANYOF_LOC_FOLD)
15714             sv_catpvs(sv, "{i}");
15715         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
15716         if (flags & ANYOF_INVERT)
15717             sv_catpvs(sv, "^");
15718
15719         /* output what the standard cp 0-255 bitmap matches */
15720         do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
15721
15722         /* output any special charclass tests (used entirely under use
15723          * locale) * */
15724         if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
15725             int i;
15726             for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
15727                 if (ANYOF_POSIXL_TEST(o,i)) {
15728                     sv_catpv(sv, anyofs[i]);
15729                     do_sep = 1;
15730                 }
15731             }
15732         }
15733
15734         if ((flags & (ANYOF_ABOVE_LATIN1_ALL
15735                       |ANYOF_UTF8
15736                       |ANYOF_NONBITMAP_NON_UTF8
15737                       |ANYOF_LOC_FOLD)))
15738         {
15739             if (do_sep) {
15740                 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
15741                 if (flags & ANYOF_INVERT)
15742                     /*make sure the invert info is in each */
15743                     sv_catpvs(sv, "^");
15744             }
15745
15746             if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL) {
15747                 sv_catpvs(sv, "{non-utf8-latin1-all}");
15748             }
15749
15750             /* output information about the unicode matching */
15751             if (flags & ANYOF_ABOVE_LATIN1_ALL)
15752                 sv_catpvs(sv, "{unicode_all}");
15753             else if (ARG(o) != ANYOF_NONBITMAP_EMPTY) {
15754                 SV *lv; /* Set if there is something outside the bit map. */
15755                 bool byte_output = FALSE;   /* If something in the bitmap has
15756                                                been output */
15757                 SV *only_utf8_locale;
15758
15759                 /* Get the stuff that wasn't in the bitmap */
15760                 (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
15761                                                     &lv, &only_utf8_locale);
15762                 if (lv && lv != &PL_sv_undef) {
15763                     char *s = savesvpv(lv);
15764                     char * const origs = s;
15765
15766                     while (*s && *s != '\n')
15767                         s++;
15768
15769                     if (*s == '\n') {
15770                         const char * const t = ++s;
15771
15772                         if (flags & ANYOF_NONBITMAP_NON_UTF8) {
15773                             sv_catpvs(sv, "{outside bitmap}");
15774                         }
15775                         else {
15776                             sv_catpvs(sv, "{utf8}");
15777                         }
15778
15779                         if (byte_output) {
15780                             sv_catpvs(sv, " ");
15781                         }
15782
15783                         while (*s) {
15784                             if (*s == '\n') {
15785
15786                                 /* Truncate very long output */
15787                                 if (s - origs > 256) {
15788                                     Perl_sv_catpvf(aTHX_ sv,
15789                                                 "%.*s...",
15790                                                 (int) (s - origs - 1),
15791                                                 t);
15792                                     goto out_dump;
15793                                 }
15794                                 *s = ' ';
15795                             }
15796                             else if (*s == '\t') {
15797                                 *s = '-';
15798                             }
15799                             s++;
15800                         }
15801                         if (s[-1] == ' ')
15802                             s[-1] = 0;
15803
15804                         sv_catpv(sv, t);
15805                     }
15806
15807                 out_dump:
15808
15809                     Safefree(origs);
15810                     SvREFCNT_dec_NN(lv);
15811                 }
15812
15813                 if ((flags & ANYOF_LOC_FOLD)
15814                      && only_utf8_locale
15815                      && only_utf8_locale != &PL_sv_undef)
15816                 {
15817                     UV start, end;
15818                     int max_entries = 256;
15819
15820                     sv_catpvs(sv, "{utf8 locale}");
15821                     invlist_iterinit(only_utf8_locale);
15822                     while (invlist_iternext(only_utf8_locale,
15823                                             &start, &end)) {
15824                         put_range(sv, start, end);
15825                         max_entries --;
15826                         if (max_entries < 0) {
15827                             sv_catpvs(sv, "...");
15828                             break;
15829                         }
15830                     }
15831                     invlist_iterfinish(only_utf8_locale);
15832                 }
15833             }
15834         }
15835
15836         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
15837     }
15838     else if (k == POSIXD || k == NPOSIXD) {
15839         U8 index = FLAGS(o) * 2;
15840         if (index < C_ARRAY_LENGTH(anyofs)) {
15841             if (*anyofs[index] != '[')  {
15842                 sv_catpv(sv, "[");
15843             }
15844             sv_catpv(sv, anyofs[index]);
15845             if (*anyofs[index] != '[')  {
15846                 sv_catpv(sv, "]");
15847             }
15848         }
15849         else {
15850             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
15851         }
15852     }
15853     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
15854         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
15855 #else
15856     PERL_UNUSED_CONTEXT;
15857     PERL_UNUSED_ARG(sv);
15858     PERL_UNUSED_ARG(o);
15859     PERL_UNUSED_ARG(prog);
15860     PERL_UNUSED_ARG(reginfo);
15861 #endif  /* DEBUGGING */
15862 }
15863
15864
15865
15866 SV *
15867 Perl_re_intuit_string(pTHX_ REGEXP * const r)
15868 {                               /* Assume that RE_INTUIT is set */
15869     dVAR;
15870     struct regexp *const prog = ReANY(r);
15871     GET_RE_DEBUG_FLAGS_DECL;
15872
15873     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
15874     PERL_UNUSED_CONTEXT;
15875
15876     DEBUG_COMPILE_r(
15877         {
15878             const char * const s = SvPV_nolen_const(prog->check_substr
15879                       ? prog->check_substr : prog->check_utf8);
15880
15881             if (!PL_colorset) reginitcolors();
15882             PerlIO_printf(Perl_debug_log,
15883                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
15884                       PL_colors[4],
15885                       prog->check_substr ? "" : "utf8 ",
15886                       PL_colors[5],PL_colors[0],
15887                       s,
15888                       PL_colors[1],
15889                       (strlen(s) > 60 ? "..." : ""));
15890         } );
15891
15892     return prog->check_substr ? prog->check_substr : prog->check_utf8;
15893 }
15894
15895 /*
15896    pregfree()
15897
15898    handles refcounting and freeing the perl core regexp structure. When
15899    it is necessary to actually free the structure the first thing it
15900    does is call the 'free' method of the regexp_engine associated to
15901    the regexp, allowing the handling of the void *pprivate; member
15902    first. (This routine is not overridable by extensions, which is why
15903    the extensions free is called first.)
15904
15905    See regdupe and regdupe_internal if you change anything here.
15906 */
15907 #ifndef PERL_IN_XSUB_RE
15908 void
15909 Perl_pregfree(pTHX_ REGEXP *r)
15910 {
15911     SvREFCNT_dec(r);
15912 }
15913
15914 void
15915 Perl_pregfree2(pTHX_ REGEXP *rx)
15916 {
15917     dVAR;
15918     struct regexp *const r = ReANY(rx);
15919     GET_RE_DEBUG_FLAGS_DECL;
15920
15921     PERL_ARGS_ASSERT_PREGFREE2;
15922
15923     if (r->mother_re) {
15924         ReREFCNT_dec(r->mother_re);
15925     } else {
15926         CALLREGFREE_PVT(rx); /* free the private data */
15927         SvREFCNT_dec(RXp_PAREN_NAMES(r));
15928         Safefree(r->xpv_len_u.xpvlenu_pv);
15929     }
15930     if (r->substrs) {
15931         SvREFCNT_dec(r->anchored_substr);
15932         SvREFCNT_dec(r->anchored_utf8);
15933         SvREFCNT_dec(r->float_substr);
15934         SvREFCNT_dec(r->float_utf8);
15935         Safefree(r->substrs);
15936     }
15937     RX_MATCH_COPY_FREE(rx);
15938 #ifdef PERL_ANY_COW
15939     SvREFCNT_dec(r->saved_copy);
15940 #endif
15941     Safefree(r->offs);
15942     SvREFCNT_dec(r->qr_anoncv);
15943     rx->sv_u.svu_rx = 0;
15944 }
15945
15946 /*  reg_temp_copy()
15947
15948     This is a hacky workaround to the structural issue of match results
15949     being stored in the regexp structure which is in turn stored in
15950     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
15951     could be PL_curpm in multiple contexts, and could require multiple
15952     result sets being associated with the pattern simultaneously, such
15953     as when doing a recursive match with (??{$qr})
15954
15955     The solution is to make a lightweight copy of the regexp structure
15956     when a qr// is returned from the code executed by (??{$qr}) this
15957     lightweight copy doesn't actually own any of its data except for
15958     the starp/end and the actual regexp structure itself.
15959
15960 */
15961
15962
15963 REGEXP *
15964 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
15965 {
15966     struct regexp *ret;
15967     struct regexp *const r = ReANY(rx);
15968     const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
15969
15970     PERL_ARGS_ASSERT_REG_TEMP_COPY;
15971
15972     if (!ret_x)
15973         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
15974     else {
15975         SvOK_off((SV *)ret_x);
15976         if (islv) {
15977             /* For PVLVs, SvANY points to the xpvlv body while sv_u points
15978                to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
15979                made both spots point to the same regexp body.) */
15980             REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
15981             assert(!SvPVX(ret_x));
15982             ret_x->sv_u.svu_rx = temp->sv_any;
15983             temp->sv_any = NULL;
15984             SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
15985             SvREFCNT_dec_NN(temp);
15986             /* SvCUR still resides in the xpvlv struct, so the regexp copy-
15987                ing below will not set it. */
15988             SvCUR_set(ret_x, SvCUR(rx));
15989         }
15990     }
15991     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
15992        sv_force_normal(sv) is called.  */
15993     SvFAKE_on(ret_x);
15994     ret = ReANY(ret_x);
15995
15996     SvFLAGS(ret_x) |= SvUTF8(rx);
15997     /* We share the same string buffer as the original regexp, on which we
15998        hold a reference count, incremented when mother_re is set below.
15999        The string pointer is copied here, being part of the regexp struct.
16000      */
16001     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
16002            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
16003     if (r->offs) {
16004         const I32 npar = r->nparens+1;
16005         Newx(ret->offs, npar, regexp_paren_pair);
16006         Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16007     }
16008     if (r->substrs) {
16009         Newx(ret->substrs, 1, struct reg_substr_data);
16010         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16011
16012         SvREFCNT_inc_void(ret->anchored_substr);
16013         SvREFCNT_inc_void(ret->anchored_utf8);
16014         SvREFCNT_inc_void(ret->float_substr);
16015         SvREFCNT_inc_void(ret->float_utf8);
16016
16017         /* check_substr and check_utf8, if non-NULL, point to either their
16018            anchored or float namesakes, and don't hold a second reference.  */
16019     }
16020     RX_MATCH_COPIED_off(ret_x);
16021 #ifdef PERL_ANY_COW
16022     ret->saved_copy = NULL;
16023 #endif
16024     ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
16025     SvREFCNT_inc_void(ret->qr_anoncv);
16026
16027     return ret_x;
16028 }
16029 #endif
16030
16031 /* regfree_internal()
16032
16033    Free the private data in a regexp. This is overloadable by
16034    extensions. Perl takes care of the regexp structure in pregfree(),
16035    this covers the *pprivate pointer which technically perl doesn't
16036    know about, however of course we have to handle the
16037    regexp_internal structure when no extension is in use.
16038
16039    Note this is called before freeing anything in the regexp
16040    structure.
16041  */
16042
16043 void
16044 Perl_regfree_internal(pTHX_ REGEXP * const rx)
16045 {
16046     dVAR;
16047     struct regexp *const r = ReANY(rx);
16048     RXi_GET_DECL(r,ri);
16049     GET_RE_DEBUG_FLAGS_DECL;
16050
16051     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
16052
16053     DEBUG_COMPILE_r({
16054         if (!PL_colorset)
16055             reginitcolors();
16056         {
16057             SV *dsv= sv_newmortal();
16058             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
16059                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
16060             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
16061                 PL_colors[4],PL_colors[5],s);
16062         }
16063     });
16064 #ifdef RE_TRACK_PATTERN_OFFSETS
16065     if (ri->u.offsets)
16066         Safefree(ri->u.offsets);             /* 20010421 MJD */
16067 #endif
16068     if (ri->code_blocks) {
16069         int n;
16070         for (n = 0; n < ri->num_code_blocks; n++)
16071             SvREFCNT_dec(ri->code_blocks[n].src_regex);
16072         Safefree(ri->code_blocks);
16073     }
16074
16075     if (ri->data) {
16076         int n = ri->data->count;
16077
16078         while (--n >= 0) {
16079           /* If you add a ->what type here, update the comment in regcomp.h */
16080             switch (ri->data->what[n]) {
16081             case 'a':
16082             case 'r':
16083             case 's':
16084             case 'S':
16085             case 'u':
16086                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
16087                 break;
16088             case 'f':
16089                 Safefree(ri->data->data[n]);
16090                 break;
16091             case 'l':
16092             case 'L':
16093                 break;
16094             case 'T':
16095                 { /* Aho Corasick add-on structure for a trie node.
16096                      Used in stclass optimization only */
16097                     U32 refcount;
16098                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
16099                     OP_REFCNT_LOCK;
16100                     refcount = --aho->refcount;
16101                     OP_REFCNT_UNLOCK;
16102                     if ( !refcount ) {
16103                         PerlMemShared_free(aho->states);
16104                         PerlMemShared_free(aho->fail);
16105                          /* do this last!!!! */
16106                         PerlMemShared_free(ri->data->data[n]);
16107                         PerlMemShared_free(ri->regstclass);
16108                     }
16109                 }
16110                 break;
16111             case 't':
16112                 {
16113                     /* trie structure. */
16114                     U32 refcount;
16115                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
16116                     OP_REFCNT_LOCK;
16117                     refcount = --trie->refcount;
16118                     OP_REFCNT_UNLOCK;
16119                     if ( !refcount ) {
16120                         PerlMemShared_free(trie->charmap);
16121                         PerlMemShared_free(trie->states);
16122                         PerlMemShared_free(trie->trans);
16123                         if (trie->bitmap)
16124                             PerlMemShared_free(trie->bitmap);
16125                         if (trie->jump)
16126                             PerlMemShared_free(trie->jump);
16127                         PerlMemShared_free(trie->wordinfo);
16128                         /* do this last!!!! */
16129                         PerlMemShared_free(ri->data->data[n]);
16130                     }
16131                 }
16132                 break;
16133             default:
16134                 Perl_croak(aTHX_ "panic: regfree data code '%c'",
16135                                                     ri->data->what[n]);
16136             }
16137         }
16138         Safefree(ri->data->what);
16139         Safefree(ri->data);
16140     }
16141
16142     Safefree(ri);
16143 }
16144
16145 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
16146 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
16147 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
16148
16149 /*
16150    re_dup - duplicate a regexp.
16151
16152    This routine is expected to clone a given regexp structure. It is only
16153    compiled under USE_ITHREADS.
16154
16155    After all of the core data stored in struct regexp is duplicated
16156    the regexp_engine.dupe method is used to copy any private data
16157    stored in the *pprivate pointer. This allows extensions to handle
16158    any duplication it needs to do.
16159
16160    See pregfree() and regfree_internal() if you change anything here.
16161 */
16162 #if defined(USE_ITHREADS)
16163 #ifndef PERL_IN_XSUB_RE
16164 void
16165 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
16166 {
16167     dVAR;
16168     I32 npar;
16169     const struct regexp *r = ReANY(sstr);
16170     struct regexp *ret = ReANY(dstr);
16171
16172     PERL_ARGS_ASSERT_RE_DUP_GUTS;
16173
16174     npar = r->nparens+1;
16175     Newx(ret->offs, npar, regexp_paren_pair);
16176     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16177
16178     if (ret->substrs) {
16179         /* Do it this way to avoid reading from *r after the StructCopy().
16180            That way, if any of the sv_dup_inc()s dislodge *r from the L1
16181            cache, it doesn't matter.  */
16182         const bool anchored = r->check_substr
16183             ? r->check_substr == r->anchored_substr
16184             : r->check_utf8 == r->anchored_utf8;
16185         Newx(ret->substrs, 1, struct reg_substr_data);
16186         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16187
16188         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
16189         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
16190         ret->float_substr = sv_dup_inc(ret->float_substr, param);
16191         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
16192
16193         /* check_substr and check_utf8, if non-NULL, point to either their
16194            anchored or float namesakes, and don't hold a second reference.  */
16195
16196         if (ret->check_substr) {
16197             if (anchored) {
16198                 assert(r->check_utf8 == r->anchored_utf8);
16199                 ret->check_substr = ret->anchored_substr;
16200                 ret->check_utf8 = ret->anchored_utf8;
16201             } else {
16202                 assert(r->check_substr == r->float_substr);
16203                 assert(r->check_utf8 == r->float_utf8);
16204                 ret->check_substr = ret->float_substr;
16205                 ret->check_utf8 = ret->float_utf8;
16206             }
16207         } else if (ret->check_utf8) {
16208             if (anchored) {
16209                 ret->check_utf8 = ret->anchored_utf8;
16210             } else {
16211                 ret->check_utf8 = ret->float_utf8;
16212             }
16213         }
16214     }
16215
16216     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
16217     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
16218
16219     if (ret->pprivate)
16220         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
16221
16222     if (RX_MATCH_COPIED(dstr))
16223         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
16224     else
16225         ret->subbeg = NULL;
16226 #ifdef PERL_ANY_COW
16227     ret->saved_copy = NULL;
16228 #endif
16229
16230     /* Whether mother_re be set or no, we need to copy the string.  We
16231        cannot refrain from copying it when the storage points directly to
16232        our mother regexp, because that's
16233                1: a buffer in a different thread
16234                2: something we no longer hold a reference on
16235                so we need to copy it locally.  */
16236     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
16237     ret->mother_re   = NULL;
16238 }
16239 #endif /* PERL_IN_XSUB_RE */
16240
16241 /*
16242    regdupe_internal()
16243
16244    This is the internal complement to regdupe() which is used to copy
16245    the structure pointed to by the *pprivate pointer in the regexp.
16246    This is the core version of the extension overridable cloning hook.
16247    The regexp structure being duplicated will be copied by perl prior
16248    to this and will be provided as the regexp *r argument, however
16249    with the /old/ structures pprivate pointer value. Thus this routine
16250    may override any copying normally done by perl.
16251
16252    It returns a pointer to the new regexp_internal structure.
16253 */
16254
16255 void *
16256 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
16257 {
16258     dVAR;
16259     struct regexp *const r = ReANY(rx);
16260     regexp_internal *reti;
16261     int len;
16262     RXi_GET_DECL(r,ri);
16263
16264     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
16265
16266     len = ProgLen(ri);
16267
16268     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
16269           char, regexp_internal);
16270     Copy(ri->program, reti->program, len+1, regnode);
16271
16272     reti->num_code_blocks = ri->num_code_blocks;
16273     if (ri->code_blocks) {
16274         int n;
16275         Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
16276                 struct reg_code_block);
16277         Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
16278                 struct reg_code_block);
16279         for (n = 0; n < ri->num_code_blocks; n++)
16280              reti->code_blocks[n].src_regex = (REGEXP*)
16281                     sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
16282     }
16283     else
16284         reti->code_blocks = NULL;
16285
16286     reti->regstclass = NULL;
16287
16288     if (ri->data) {
16289         struct reg_data *d;
16290         const int count = ri->data->count;
16291         int i;
16292
16293         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
16294                 char, struct reg_data);
16295         Newx(d->what, count, U8);
16296
16297         d->count = count;
16298         for (i = 0; i < count; i++) {
16299             d->what[i] = ri->data->what[i];
16300             switch (d->what[i]) {
16301                 /* see also regcomp.h and regfree_internal() */
16302             case 'a': /* actually an AV, but the dup function is identical.  */
16303             case 'r':
16304             case 's':
16305             case 'S':
16306             case 'u': /* actually an HV, but the dup function is identical.  */
16307                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
16308                 break;
16309             case 'f':
16310                 /* This is cheating. */
16311                 Newx(d->data[i], 1, regnode_ssc);
16312                 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
16313                 reti->regstclass = (regnode*)d->data[i];
16314                 break;
16315             case 'T':
16316                 /* Trie stclasses are readonly and can thus be shared
16317                  * without duplication. We free the stclass in pregfree
16318                  * when the corresponding reg_ac_data struct is freed.
16319                  */
16320                 reti->regstclass= ri->regstclass;
16321                 /* FALLTHROUGH */
16322             case 't':
16323                 OP_REFCNT_LOCK;
16324                 ((reg_trie_data*)ri->data->data[i])->refcount++;
16325                 OP_REFCNT_UNLOCK;
16326                 /* FALLTHROUGH */
16327             case 'l':
16328             case 'L':
16329                 d->data[i] = ri->data->data[i];
16330                 break;
16331             default:
16332                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
16333                                                            ri->data->what[i]);
16334             }
16335         }
16336
16337         reti->data = d;
16338     }
16339     else
16340         reti->data = NULL;
16341
16342     reti->name_list_idx = ri->name_list_idx;
16343
16344 #ifdef RE_TRACK_PATTERN_OFFSETS
16345     if (ri->u.offsets) {
16346         Newx(reti->u.offsets, 2*len+1, U32);
16347         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
16348     }
16349 #else
16350     SetProgLen(reti,len);
16351 #endif
16352
16353     return (void*)reti;
16354 }
16355
16356 #endif    /* USE_ITHREADS */
16357
16358 #ifndef PERL_IN_XSUB_RE
16359
16360 /*
16361  - regnext - dig the "next" pointer out of a node
16362  */
16363 regnode *
16364 Perl_regnext(pTHX_ regnode *p)
16365 {
16366     dVAR;
16367     I32 offset;
16368
16369     if (!p)
16370         return(NULL);
16371
16372     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
16373         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
16374                                                 (int)OP(p), (int)REGNODE_MAX);
16375     }
16376
16377     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
16378     if (offset == 0)
16379         return(NULL);
16380
16381     return(p+offset);
16382 }
16383 #endif
16384
16385 STATIC void
16386 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
16387 {
16388     va_list args;
16389     STRLEN l1 = strlen(pat1);
16390     STRLEN l2 = strlen(pat2);
16391     char buf[512];
16392     SV *msv;
16393     const char *message;
16394
16395     PERL_ARGS_ASSERT_RE_CROAK2;
16396
16397     if (l1 > 510)
16398         l1 = 510;
16399     if (l1 + l2 > 510)
16400         l2 = 510 - l1;
16401     Copy(pat1, buf, l1 , char);
16402     Copy(pat2, buf + l1, l2 , char);
16403     buf[l1 + l2] = '\n';
16404     buf[l1 + l2 + 1] = '\0';
16405     va_start(args, pat2);
16406     msv = vmess(buf, &args);
16407     va_end(args);
16408     message = SvPV_const(msv,l1);
16409     if (l1 > 512)
16410         l1 = 512;
16411     Copy(message, buf, l1 , char);
16412     /* l1-1 to avoid \n */
16413     Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
16414 }
16415
16416 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
16417
16418 #ifndef PERL_IN_XSUB_RE
16419 void
16420 Perl_save_re_context(pTHX)
16421 {
16422     dVAR;
16423
16424     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
16425     if (PL_curpm) {
16426         const REGEXP * const rx = PM_GETRE(PL_curpm);
16427         if (rx) {
16428             U32 i;
16429             for (i = 1; i <= RX_NPARENS(rx); i++) {
16430                 char digits[TYPE_CHARS(long)];
16431                 const STRLEN len = my_snprintf(digits, sizeof(digits),
16432                                                "%lu", (long)i);
16433                 GV *const *const gvp
16434                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
16435
16436                 if (gvp) {
16437                     GV * const gv = *gvp;
16438                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
16439                         save_scalar(gv);
16440                 }
16441             }
16442         }
16443     }
16444 }
16445 #endif
16446
16447 #ifdef DEBUGGING
16448
16449 STATIC void
16450 S_put_byte(pTHX_ SV *sv, int c)
16451 {
16452     PERL_ARGS_ASSERT_PUT_BYTE;
16453
16454     if (!isPRINT(c)) {
16455         switch (c) {
16456             case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
16457             case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
16458             case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
16459             case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
16460             case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
16461
16462             default:
16463                 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
16464                 break;
16465         }
16466     }
16467     else {
16468         const char string = c;
16469         if (c == '-' || c == ']' || c == '\\' || c == '^')
16470             sv_catpvs(sv, "\\");
16471         sv_catpvn(sv, &string, 1);
16472     }
16473 }
16474
16475 STATIC void
16476 S_put_range(pTHX_ SV *sv, UV start, UV end)
16477 {
16478
16479     /* Appends to 'sv' a displayable version of the range of code points from
16480      * 'start' to 'end' */
16481
16482     assert(start <= end);
16483
16484     PERL_ARGS_ASSERT_PUT_RANGE;
16485
16486     if (end - start < 3) {  /* Individual chars in short ranges */
16487         for (; start <= end; start++)
16488             put_byte(sv, start);
16489     }
16490     else if (   end > 255
16491              || ! isALPHANUMERIC(start)
16492              || ! isALPHANUMERIC(end)
16493              || isDIGIT(start) != isDIGIT(end)
16494              || isUPPER(start) != isUPPER(end)
16495              || isLOWER(start) != isLOWER(end)
16496
16497                 /* This final test should get optimized out except on EBCDIC
16498                  * platforms, where it causes ranges that cross discontinuities
16499                  * like i/j to be shown as hex instead of the misleading,
16500                  * e.g. H-K (since that range includes more than H, I, J, K).
16501                  * */
16502              || (end - start) != NATIVE_TO_ASCII(end) - NATIVE_TO_ASCII(start))
16503     {
16504         Perl_sv_catpvf(aTHX_ sv, "\\x{%02" UVXf "}-\\x{%02" UVXf "}",
16505                        start,
16506                        (end < 256) ? end : 255);
16507     }
16508     else { /* Here, the ends of the range are both digits, or both uppercase,
16509               or both lowercase; and there's no discontinuity in the range
16510               (which could happen on EBCDIC platforms) */
16511         put_byte(sv, start);
16512         sv_catpvs(sv, "-");
16513         put_byte(sv, end);
16514     }
16515 }
16516
16517 STATIC bool
16518 S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
16519 {
16520     /* Appends to 'sv' a displayable version of the innards of the bracketed
16521      * character class whose bitmap is 'bitmap';  Returns 'TRUE' if it actually
16522      * output anything */
16523
16524     int i;
16525     bool has_output_anything = FALSE;
16526
16527     PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
16528
16529     for (i = 0; i < 256; i++) {
16530         if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) {
16531
16532             /* The character at index i should be output.  Find the next
16533              * character that should NOT be output */
16534             int j;
16535             for (j = i + 1; j <= 256; j++) {
16536                 if (! BITMAP_TEST((U8 *) bitmap, j)) {
16537                     break;
16538                 }
16539             }
16540
16541             /* Everything between them is a single range that should be output
16542              * */
16543             put_range(sv, i, j - 1);
16544             has_output_anything = TRUE;
16545             i = j;
16546         }
16547     }
16548
16549     return has_output_anything;
16550 }
16551
16552 #define CLEAR_OPTSTART \
16553     if (optstart) STMT_START {                                               \
16554         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,                       \
16555                               " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
16556         optstart=NULL;                                                       \
16557     } STMT_END
16558
16559 #define DUMPUNTIL(b,e)                                                       \
16560                     CLEAR_OPTSTART;                                          \
16561                     node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
16562
16563 STATIC const regnode *
16564 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
16565             const regnode *last, const regnode *plast,
16566             SV* sv, I32 indent, U32 depth)
16567 {
16568     dVAR;
16569     U8 op = PSEUDO;     /* Arbitrary non-END op. */
16570     const regnode *next;
16571     const regnode *optstart= NULL;
16572
16573     RXi_GET_DECL(r,ri);
16574     GET_RE_DEBUG_FLAGS_DECL;
16575
16576     PERL_ARGS_ASSERT_DUMPUNTIL;
16577
16578 #ifdef DEBUG_DUMPUNTIL
16579     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
16580         last ? last-start : 0,plast ? plast-start : 0);
16581 #endif
16582
16583     if (plast && plast < last)
16584         last= plast;
16585
16586     while (PL_regkind[op] != END && (!last || node < last)) {
16587         assert(node);
16588         /* While that wasn't END last time... */
16589         NODE_ALIGN(node);
16590         op = OP(node);
16591         if (op == CLOSE || op == WHILEM)
16592             indent--;
16593         next = regnext((regnode *)node);
16594
16595         /* Where, what. */
16596         if (OP(node) == OPTIMIZED) {
16597             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
16598                 optstart = node;
16599             else
16600                 goto after_print;
16601         } else
16602             CLEAR_OPTSTART;
16603
16604         regprop(r, sv, node, NULL);
16605         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
16606                       (int)(2*indent + 1), "", SvPVX_const(sv));
16607
16608         if (OP(node) != OPTIMIZED) {
16609             if (next == NULL)           /* Next ptr. */
16610                 PerlIO_printf(Perl_debug_log, " (0)");
16611             else if (PL_regkind[(U8)op] == BRANCH
16612                      && PL_regkind[OP(next)] != BRANCH )
16613                 PerlIO_printf(Perl_debug_log, " (FAIL)");
16614             else
16615                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
16616             (void)PerlIO_putc(Perl_debug_log, '\n');
16617         }
16618
16619       after_print:
16620         if (PL_regkind[(U8)op] == BRANCHJ) {
16621             assert(next);
16622             {
16623                 const regnode *nnode = (OP(next) == LONGJMP
16624                                        ? regnext((regnode *)next)
16625                                        : next);
16626                 if (last && nnode > last)
16627                     nnode = last;
16628                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
16629             }
16630         }
16631         else if (PL_regkind[(U8)op] == BRANCH) {
16632             assert(next);
16633             DUMPUNTIL(NEXTOPER(node), next);
16634         }
16635         else if ( PL_regkind[(U8)op]  == TRIE ) {
16636             const regnode *this_trie = node;
16637             const char op = OP(node);
16638             const U32 n = ARG(node);
16639             const reg_ac_data * const ac = op>=AHOCORASICK ?
16640                (reg_ac_data *)ri->data->data[n] :
16641                NULL;
16642             const reg_trie_data * const trie =
16643                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
16644 #ifdef DEBUGGING
16645             AV *const trie_words
16646                            = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
16647 #endif
16648             const regnode *nextbranch= NULL;
16649             I32 word_idx;
16650             sv_setpvs(sv, "");
16651             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
16652                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
16653
16654                 PerlIO_printf(Perl_debug_log, "%*s%s ",
16655                    (int)(2*(indent+3)), "",
16656                     elem_ptr
16657                     ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
16658                                 SvCUR(*elem_ptr), 60,
16659                                 PL_colors[0], PL_colors[1],
16660                                 (SvUTF8(*elem_ptr)
16661                                  ? PERL_PV_ESCAPE_UNI
16662                                  : 0)
16663                                 | PERL_PV_PRETTY_ELLIPSES
16664                                 | PERL_PV_PRETTY_LTGT
16665                             )
16666                     : "???"
16667                 );
16668                 if (trie->jump) {
16669                     U16 dist= trie->jump[word_idx+1];
16670                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
16671                                (UV)((dist ? this_trie + dist : next) - start));
16672                     if (dist) {
16673                         if (!nextbranch)
16674                             nextbranch= this_trie + trie->jump[0];
16675                         DUMPUNTIL(this_trie + dist, nextbranch);
16676                     }
16677                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
16678                         nextbranch= regnext((regnode *)nextbranch);
16679                 } else {
16680                     PerlIO_printf(Perl_debug_log, "\n");
16681                 }
16682             }
16683             if (last && next > last)
16684                 node= last;
16685             else
16686                 node= next;
16687         }
16688         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
16689             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
16690                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
16691         }
16692         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
16693             assert(next);
16694             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
16695         }
16696         else if ( op == PLUS || op == STAR) {
16697             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
16698         }
16699         else if (PL_regkind[(U8)op] == ANYOF) {
16700             /* arglen 1 + class block */
16701             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL)
16702                           ? ANYOF_POSIXL_SKIP
16703                           : ANYOF_SKIP);
16704             node = NEXTOPER(node);
16705         }
16706         else if (PL_regkind[(U8)op] == EXACT) {
16707             /* Literal string, where present. */
16708             node += NODE_SZ_STR(node) - 1;
16709             node = NEXTOPER(node);
16710         }
16711         else {
16712             node = NEXTOPER(node);
16713             node += regarglen[(U8)op];
16714         }
16715         if (op == CURLYX || op == OPEN)
16716             indent++;
16717     }
16718     CLEAR_OPTSTART;
16719 #ifdef DEBUG_DUMPUNTIL
16720     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
16721 #endif
16722     return node;
16723 }
16724
16725 #endif  /* DEBUGGING */
16726
16727 /*
16728  * Local variables:
16729  * c-indentation-style: bsd
16730  * c-basic-offset: 4
16731  * indent-tabs-mode: nil
16732  * End:
16733  *
16734  * ex: set ts=8 sts=4 sw=4 et:
16735  */