regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which, funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * be confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 EXTERN_C const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_static.c"
  90 #include "charclass_invlists.h"
  91 #include "inline_invlist.c"
  92 #include "unicode_constants.h"
  93 /* Short local names for the long, deliberately unwieldy ONLY_FOR_USE_BY
      * macros.  The long forms are not defined in this part of the file. */
  94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
  95  _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  96 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  97 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  98 /* Fall back to plain static when nothing earlier has defined STATIC */
  99 #ifndef STATIC
 100 #define STATIC  static
 101 #endif
 102
 103 /* Working state used while compiling a pattern.  The RExC_* shorthand
      * macros access these members through a pointer named pRExC_state. */
 104 struct RExC_state_t {
 105     U32         flags;                  /* RXf_* are we folding, multilining? */
 106     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 107     char        *precomp;               /* uncompiled string. */
 108     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 109     regexp      *rx;                    /* perl core regexp structure */
 110     regexp_internal     *rxi;           /* internal data for regexp object
 111                                            pprivate field */
 112     char        *start;                 /* Start of input for compile */
 113     char        *end;                   /* End of input for compile */
 114     char        *parse;                 /* Input-scan pointer. */
 115     SSize_t     whilem_seen;            /* number of WHILEM in this expr */
 116     regnode     *emit_start;            /* Start of emitted-code area */
 117     regnode     *emit_bound;            /* First regnode outside of the
 118                                            allocated space */
 119     regnode     *emit;                  /* Code-emit pointer; if = &emit_dummy,
 120                                            implies compiling, so don't emit */
 121     regnode_ssc emit_dummy;             /* placeholder for emit to point to;
 122                                            large enough for the largest
 123                                            non-EXACTish node, so can use it as
 124                                            scratch in pass1 */
 125     I32         naughty;                /* How bad is this pattern? */
 126     I32         sawback;                /* Did we see \1, ...? */
 127     U32         seen;
 128     SSize_t     size;                   /* Code size. */
 129     I32                npar;            /* Capture buffer count, (OPEN) plus
 130                                            one. ("par" 0 is the whole
 131                                            pattern)*/
 132     I32         nestroot;               /* root parens we are in - used by
 133                                            accept */
 134     I32         extralen;
 135     I32         seen_zerolen;
 136     regnode     **open_parens;          /* pointers to open parens */
 137     regnode     **close_parens;         /* pointers to close parens */
 138     regnode     *opend;                 /* END node in program */
 139     I32         utf8;           /* whether the pattern is utf8 or not */
 140     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 141                                 /* XXX use this for future optimisation of case
 142                                  * where pattern must be upgraded to utf8. */
 143     I32         uni_semantics;  /* If a d charset modifier should use unicode
 144                                    rules, even if the pattern is not in
 145                                    utf8 */
 146     HV          *paren_names;           /* Paren names */
 147
 148     regnode     **recurse;              /* Recurse regops */
 149     I32         recurse_count;          /* Number of recurse regops */
 150     U8          *study_chunk_recursed;  /* bitmap of which parens we have moved
 151                                            through */
 152     U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
 153     I32         in_lookbehind;
 154     I32         contains_locale;
 155     I32         contains_i;
 156     I32         override_recoding;
 157     I32         in_multi_char_class;
 158     struct reg_code_block *code_blocks; /* positions of literal (?{})
 159                                             within pattern */
 160     int         num_code_blocks;        /* size of code_blocks[] */
 161     int         code_index;             /* next code_blocks[] slot */
 162     SSize_t     maxlen;                        /* minimum possible number of chars in string to match.
                                                 * NOTE(review): the member is named maxlen, so this
                                                 * presumably means the maximum -- confirm */
 163 #ifdef ADD_TO_REGEXEC
 164     char        *starttry;              /* -Dr: where regtry was called. */
 165 #define RExC_starttry   (pRExC_state->starttry)
 166 #endif
 167     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 168 #ifdef DEBUGGING  /* the members and accessors up to the #endif exist only then */
 169     const char  *lastparse;
 170     I32         lastnum;
 171     AV          *paren_name_list;       /* idx -> name */
 172 #define RExC_lastparse  (pRExC_state->lastparse)
 173 #define RExC_lastnum    (pRExC_state->lastnum)
 174 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 175 #endif
 176 };
 177 /* Shorthand for the members of the RExC_state_t that pRExC_state points to.
      * Each one expands to a member access through pRExC_state, so a pointer
      * of that name must be in scope wherever one of these is used. */
 178 #define RExC_flags      (pRExC_state->flags)
 179 #define RExC_pm_flags   (pRExC_state->pm_flags)
 180 #define RExC_precomp    (pRExC_state->precomp)
 181 #define RExC_rx_sv      (pRExC_state->rx_sv)
 182 #define RExC_rx         (pRExC_state->rx)
 183 #define RExC_rxi        (pRExC_state->rxi)
 184 #define RExC_start      (pRExC_state->start)
 185 #define RExC_end        (pRExC_state->end)
 186 #define RExC_parse      (pRExC_state->parse)
 187 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 188 #ifdef RE_TRACK_PATTERN_OFFSETS
 189 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the
 190                                                          others: I reach
                                                              through rxi instead
                                                              of naming a direct
                                                              member */
 191 #endif
 192 #define RExC_emit       (pRExC_state->emit)
 193 #define RExC_emit_dummy (pRExC_state->emit_dummy)
 194 #define RExC_emit_start (pRExC_state->emit_start)
 195 #define RExC_emit_bound (pRExC_state->emit_bound)
 196 #define RExC_naughty    (pRExC_state->naughty)
 197 #define RExC_sawback    (pRExC_state->sawback)
 198 #define RExC_seen       (pRExC_state->seen)
 199 #define RExC_size       (pRExC_state->size)
 200 #define RExC_maxlen        (pRExC_state->maxlen)
 201 #define RExC_npar       (pRExC_state->npar)
 202 #define RExC_nestroot   (pRExC_state->nestroot)
 203 #define RExC_extralen   (pRExC_state->extralen)
 204 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 205 #define RExC_utf8       (pRExC_state->utf8)
 206 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 207 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 208 #define RExC_open_parens        (pRExC_state->open_parens)
 209 #define RExC_close_parens       (pRExC_state->close_parens)
 210 #define RExC_opend      (pRExC_state->opend)
 211 #define RExC_paren_names        (pRExC_state->paren_names)
 212 #define RExC_recurse    (pRExC_state->recurse)
 213 #define RExC_recurse_count      (pRExC_state->recurse_count)
 214 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
 215 #define RExC_study_chunk_recursed_bytes  \
 216                                    (pRExC_state->study_chunk_recursed_bytes)
 217 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 218 #define RExC_contains_locale    (pRExC_state->contains_locale)
 219 #define RExC_contains_i (pRExC_state->contains_i)
 220 #define RExC_override_recoding (pRExC_state->override_recoding)
 221 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 222
 223 /* Quantifier tests.  ISMULT1(c): is the character c one of '*', '+', '?'.
      * ISMULT2(s): does the text at pointer s start with one of those three
      * characters, or with a '{' that regcurly() accepts.  Both macros
      * evaluate their argument more than once, so it must be free of side
      * effects.  ISMULT2 parenthesizes s before dereferencing it, so that an
      * argument such as p + 1 tests p[1] rather than the value *p + 1. */
 224 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 225 #define ISMULT2(s)      (*(s) == '*' || *(s) == '+' || *(s) == '?' || \
 226         (*(s) == '{' && regcurly((s), FALSE)))
 227
 228 /*
 229  * Flags to be passed up and down.
 230  */
 231 #define WORST           0       /* Worst case. */
 232 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 233
 234 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 235  * character.  (There needs to be a case: in the switch statement in regexec.c
 236  * for any node marked SIMPLE.)  Note that this is not the same thing as
 237  * REGNODE_SIMPLE */
 238 #define SIMPLE          0x02
 239 #define SPSTART         0x04    /* Starts with * or + */
 240 #define POSTPONED       0x08    /* (?1),(?&name), (??{...}) or similar */
 241 #define TRYAGAIN        0x10    /* Weeded out a declaration. */
 242 #define RESTART_UTF8    0x20    /* Restart, need to calculate sizes as UTF-8 */
 243 /* Index of node x counted from RExC_emit_start, or -1 when x is NULL */
 244 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 245
 246 /* Whether trie related optimizations are enabled: the three symbols below
      * are defined (with empty values) only when
      * PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION evaluates non-zero. */
 247 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 248 #define TRIE_STUDY_OPT
 249 #define FULL_TRIE_STUDY
 250 #define TRIE_STCLASS
 251 #endif
 252
 253
 254 /* One-bit-per-paren bitmap helpers.  u8str is treated as an array of U8 in
      * which paren number paren occupies bit (paren & 7) of byte (paren >> 3).
      * PAREN_TEST yields the masked bit (non-zero when set).  PAREN_SET and
      * PAREN_UNSET expand to bare assignment expressions without outer
      * parentheses, so use them as statements. */
 255 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 256 #define PBITVAL(paren) (1 << ((paren) & 7))
 257 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 258 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 259 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 260 /* Unless UTF is already true, store RESTART_UTF8 through flagp and return
      * NULL from the function that uses the macro.  That function must
      * therefore have a pointer named flagp in scope and must itself return a
      * pointer. */
 261 #define REQUIRE_UTF8    STMT_START {                                       \
 262                                      if (!UTF) {                           \
 263                                          *flagp = RESTART_UTF8;            \
 264                                          return NULL;                      \
 265                                      }                                     \
 266                         } STMT_END
 267
 268 /* These convert between the named class defined in regcomp.h and its
 269  * equivalent class number defined in handy.h.  A named class divided by 2
      * gives the class number (with integer operands both 2n and 2n+1 give n);
      * a class number times 2 gives the even-valued named class. */
 270 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 271 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 272 /* Convenience forms of the two _maybe_complement_2nd functions with their
      * third argument fixed at TRUE */
 273 #define _invlist_union_complement_2nd(a, b, output) \
 274                         _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
 275 #define _invlist_intersection_complement_2nd(a, b, output) \
 276                  _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
 277
 278 /* About scan_data_t.
 279
 280   During optimisation we recurse through the regexp program performing
 281   various inplace (keyhole style) optimisations. In addition study_chunk
 282   and scan_commit populate this data structure with information about
 283   what strings MUST appear in the pattern. We look for the longest
 284   string that must appear at a fixed location, and we look for the
 285   longest string that may appear at a floating location. So for instance
 286   in the pattern:
 287
 288     /FOO[xX]A.*B[xX]BAR/
 289
 290   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 291   strings (because they follow a .* construct). study_chunk will identify
 292   both FOO and BAR as being the longest fixed and floating strings respectively.
 293
 294   The strings can be composites, for instance
 295
 296      /(f)(o)(o)/
 297
 298   will result in a composite fixed substring 'foo'.
 299
 300   For each string some basic information is maintained:
 301
 302   - offset or min_offset
 303     This is the position the string must appear at, or not before.
 304     It also implicitly (when combined with minlenp) tells us how many
 305     characters must match before the string we are searching for.
 306     Likewise when combined with minlenp and the length of the string it
 307     tells us how many characters must appear after the string we have
 308     found.
 309
 310   - max_offset
 311     Only used for floating strings. This is the rightmost point that
 312     the string can appear at. If set to SSize_t_MAX it indicates that the
 313     string can occur infinitely far to the right.
 314
 315   - minlenp
 316     A pointer to the minimum number of characters of the pattern that the
 317     string was found inside. This is important as in the case of positive
 318     lookahead or positive lookbehind we can have multiple patterns
 319     involved. Consider
 320
 321     /(?=FOO).*F/
 322
 323     The minimum length of the pattern overall is 3, the minimum length
 324     of the lookahead part is 3, but the minimum length of the part that
 325     will actually match is 1. So 'FOO's minimum length is 3, but the
 326     minimum length for the F is 1. This is important as the minimum length
 327     is used to determine offsets in front of and behind the string being
 328     looked for.  Since strings can be composites this is the length of the
 329     pattern at the time it was committed with a scan_commit. Note that
 330     the length is calculated by study_chunk, so that the minimum lengths
 331     are not known until the full pattern has been compiled, thus the
 332     pointer to the value.
 333
 334   - lookbehind
 335
 336     In the case of lookbehind the string being searched for can be
 337     offset past the start point of the final matching string.
 338     If this value was just blithely removed from the min_offset it would
 339     invalidate some of the calculations for how many chars must match
 340     before or after (as they are derived from min_offset and minlen and
 341     the length of the string being searched for).
 342     When the final pattern is compiled and the data is moved from the
 343     scan_data_t structure into the regexp structure the information
 344     about lookbehind is factored in, with the information that would
 345     have been lost precalculated in the end_shift field for the
 346     associated string.
 347
 348   The fields pos_min and pos_delta are used to store the minimum offset
 349   and the delta to the maximum offset at the current point in the pattern.
 350
 351 */
 352
 353 typedef struct scan_data_t {
 354     /*I32 len_min;      unused */
 355     /*I32 len_delta;    unused */
 356     SSize_t pos_min;        /* minimum offset at the current point in the
                                   pattern */
 357     SSize_t pos_delta;      /* delta to the maximum offset at the current
                                   point in the pattern */
 358     SV *last_found;
 359     SSize_t last_end;       /* min value, <0 unless valid. */
 360     SSize_t last_start_min;
 361     SSize_t last_start_max;
 362     SV **longest;           /* Either &l_fixed, or &l_float (presumably the
                                   longest_fixed and longest_float members
                                   below -- confirm). */
 363     SV *longest_fixed;      /* longest fixed string found in pattern */
 364     SSize_t offset_fixed;   /* offset where it starts */
 365     SSize_t *minlen_fixed;  /* pointer to the minlen relevant to the string */
 366     I32 lookbehind_fixed;   /* is the position of the string modified by LB.
                                   NOTE(review): I32 here, while
                                   lookbehind_float is SSize_t -- confirm the
                                   difference is intended */
 367     SV *longest_float;      /* longest floating string found in pattern */
 368     SSize_t offset_float_min; /* earliest point in string it can appear */
 369     SSize_t offset_float_max; /* latest point in string it can appear */
 370     SSize_t *minlen_float;  /* pointer to the minlen relevant to the string */
 371     SSize_t lookbehind_float; /* is the pos of the string modified by LB */
 372     I32 flags;              /* NOTE(review): presumably the SF_ and SCF_ bits
                                   defined after this struct -- confirm */
 373     I32 whilem_c;
 374     SSize_t *last_closep;
 375     regnode_ssc *start_class;
 376 } scan_data_t;
 377
 378 /* The below is perhaps overboard, but this allows us to save a test at the
 379  * expense of a mask.  This is because on both EBCDIC and ASCII machines, 'A'
 380  * and 'a' differ by a single bit; the same with the upper and lower case of
 381  * all other ASCII-range alphabetics.  On ASCII platforms, they are 32 apart;
 382  * on EBCDIC, they are 64.  This uses an exclusive 'or' to find that bit and
 383  * then inverts it to form a mask, with just a single 0, in the bit position
 384  * where the upper- and lowercase differ.  XXX There are about 40 other
 385  * instances in the Perl core where this micro-optimization could be used.
 386  * Should decide if maintenance cost is worse, before changing those.
 387  *
 388  * Returns a boolean as to whether or not 'v' is either a lowercase or
 389  * uppercase instance of 'c', where 'c' is in [A-Za-z].  If 'c' is a
 390  * compile-time constant, the generated code is better than some optimizing
 391  * compilers figure out, amounting to a mask and test.  The results are
 392  * meaningless if 'c' is not one of [A-Za-z].  Each argument appears exactly
      * once in the expansion. */
 393 #define isARG2_lower_or_UPPER_ARG1(c, v) \
 394                               (((v) & ~('A' ^ 'a')) ==  ((c) & ~('A' ^ 'a')))
 395
 396 /*
 397  * Forward declarations for pregcomp()'s friends.
 398  */
 399 /* An all-zero scan_data_t.  The zeros are positional, so the list presumably
      * has to be kept in step with the members of scan_data_t. */
 400 static const scan_data_t zero_scan_data =
 401   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 402 /* SF_ and SCF_ flag bits.  Every single-bit value below is distinct, from
      * 0x0001 up to 0x10000; the *_EOL names without S or M are two-bit masks. */
 403 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 404 #define SF_BEFORE_SEOL          0x0001
 405 #define SF_BEFORE_MEOL          0x0002
 406 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 407 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 408 /* Shift counts that turn the SF_BEFORE_?EOL bits into their FIX and FL forms */
 409 #define SF_FIX_SHIFT_EOL        (+2)
 410 #define SF_FL_SHIFT_EOL         (+4)
 411
 412 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL) /* 0x04 */
 413 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL) /* 0x08 */
 414
 415 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL) /* 0x10 */
 416 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 417 #define SF_IS_INF               0x0040
 418 #define SF_HAS_PAR              0x0080
 419 #define SF_IN_PAR               0x0100
 420 #define SF_HAS_EVAL             0x0200
 421 #define SCF_DO_SUBSTR           0x0400
 422 #define SCF_DO_STCLASS_AND      0x0800
 423 #define SCF_DO_STCLASS_OR       0x1000
 424 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 425 #define SCF_WHILEM_VISITED_POS  0x2000
 426
 427 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 428 #define SCF_SEEN_ACCEPT         0x8000
 429 #define SCF_TRIE_DOING_RESTUDY 0x10000
 430 /* True when the utf8 member of the compile state is non-zero */
 431 #define UTF cBOOL(RExC_utf8)
 432
 433 /* Predicates on the charset that get_regex_charset() extracts from
      * RExC_flags.  The enums for all these are ordered so things work out
      * correctly: the AT_LEAST_ forms compare with >= and so depend on that
      * ordering. */
 434 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 435 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags)                    \
 436                                                      == REGEX_DEPENDS_CHARSET)
 437 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 438 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags)                \
 439                                                      >= REGEX_UNICODE_CHARSET)
 440 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags)                      \
 441                                             == REGEX_ASCII_RESTRICTED_CHARSET)
 442 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags)             \
 443                                             >= REGEX_ASCII_RESTRICTED_CHARSET)
 444 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags)                 \
 445                                         == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 446 /* True when the RXf_PMf_FOLD bit is set in RExC_flags */
 447 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 448
 449 /* For programs that want to be strictly Unicode compatible by dying if any
 450  * attempt is made to match a non-Unicode code point against a Unicode
 451  * property.  The macro is ckDEAD() applied to the WARN_NON_UNICODE
      * category. */
 452 #define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))
 453 /* NOTE(review): presumably the marker for "no named class"; confirm at its
      * users.  The expansion is a bare, unparenthesized -1. */
 454 #define OOB_NAMEDCLASS          -1
 455
 456 /* There is no code point that is out-of-bounds, so this is problematic.  But
 457  * its only current use is to initialize a variable that is always set before
 458  * it is looked at. */
 459 #define OOB_UNICODE             0xDEADBEEF
 460 /* Character-count helpers that follow the UTF flag.  CHR_SVLEN(sv) is
      * sv_len_utf8(sv) under UTF and SvCUR(sv) otherwise.  CHR_DIST(a,b) is
      * utf8_distance(a,b) under UTF and the plain difference a minus b
      * otherwise.  Both operands of that difference are parenthesized so that
      * an argument which is itself an expression, such as y - 1, is
      * subtracted as a whole. */
 461 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 462 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : (a) - (b))
 463
 464
 465 /* length of regex to show in messages that don't mark a position within it */
 466 #define RegexLengthToShowInErrorMessages 127
 467
 468 /*
 469  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 470  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 471  * op/pragma/warn/regcomp.
 472  */
 473 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 474 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 475 /* Format-string tail for messages that mark a position.  It contains two
      * UTF8f conversions, one on each side of MARKER2; REPORT_LOCATION_ARGS()
      * supplies the matching arguments. */
 476 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
 477                         " in m/%"UTF8f MARKER2 "%"UTF8f"/"
 478 /* The two UTF8f arguments for REPORT_LOCATION: the part of the pattern from
      * RExC_precomp up to offset, then the part from offset up to RExC_end.
      * offset appears unparenthesized in the expansion, so pass a simple
      * expression such as a variable. */
 479 #define REPORT_LOCATION_ARGS(offset)            \
 480                 UTF8fARG(UTF, offset, RExC_precomp), \
 481                 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
 482
 483 /*
 484  * Calls SAVEFREESV(RExC_rx_sv) unless SIZE_ONLY, then runs the given code
 485  * (FAIL and FAIL2 pass a Perl_croak call).  The code may use two locals
 486  * declared here: len, the length of regex to show, and ellipses.  If the
      * regex is longer than RegexLengthToShowInErrorMessages, len is chopped to
      * 10 less than that maximum and ellipses is "..."; otherwise ellipses is
      * the empty string.
 487  */
 488 #define _FAIL(code) STMT_START {                                        \
 489     const char *ellipses = "";                                          \
 490     IV len = RExC_end - RExC_precomp;                                   \
 491                                                                         \
 492     if (!SIZE_ONLY)                                                     \
 493         SAVEFREESV(RExC_rx_sv);                                         \
 494     if (len > RegexLengthToShowInErrorMessages) {                       \
 495         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 496         len = RegexLengthToShowInErrorMessages - 10;                    \
 497         ellipses = "...";                                               \
 498     }                                                                   \
 499     code;                                                               \
 500 } STMT_END
 501 /* Croak with msg (passed to a %s conversion) followed by the regex text */
 502 #define FAIL(msg) _FAIL(                            \
 503     Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/",           \
 504             msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 505 /* As FAIL, but msg is a string literal format that consumes one argument, arg */
 506 #define FAIL2(msg,arg) _FAIL(                       \
 507     Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/",         \
 508             arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 509
 510 /*
 511  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
      * (the offset of RExC_parse from RExC_precomp).  m is passed to a %s
      * conversion.
 512  */
 513 #define Simple_vFAIL(m) STMT_START {                                    \
 514     const IV offset = RExC_parse - RExC_precomp;                        \
 515     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 516             m, REPORT_LOCATION_ARGS(offset));   \
 517 } STMT_END
 518
 519 /*
 520  * Calls SAVEFREESV(RExC_rx_sv) unless SIZE_ONLY, then Simple_vFAIL()
 521  */
 522 #define vFAIL(m) STMT_START {                           \
 523     if (!SIZE_ONLY)                                     \
 524         SAVEFREESV(RExC_rx_sv);                         \
 525     Simple_vFAIL(m);                                    \
 526 } STMT_END
 527
 528 /*
 529  * Like Simple_vFAIL(), but accepts two arguments: m and one extra argument
      * a1, both handed to S_re_croak2() together with REPORT_LOCATION.
 530  */
 531 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 532     const IV offset = RExC_parse - RExC_precomp;                        \
 533     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,                      \
 534                       REPORT_LOCATION_ARGS(offset));    \
 535 } STMT_END
 536
 537 /*
 538  * Calls SAVEFREESV(RExC_rx_sv) unless SIZE_ONLY, then Simple_vFAIL2().
 539  */
 540 #define vFAIL2(m,a1) STMT_START {                       \
 541     if (!SIZE_ONLY)                                     \
 542         SAVEFREESV(RExC_rx_sv);                         \
 543     Simple_vFAIL2(m, a1);                               \
 544 } STMT_END
 545
 546
 547 /*
 548  * Like Simple_vFAIL(), but accepts three arguments.
 549  */
 550 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 551     const IV offset = RExC_parse - RExC_precomp;                \
 552     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,          \
 553             REPORT_LOCATION_ARGS(offset));      \
 554 } STMT_END
 555
 556 /*
 557  * Calls SAVEFREESV(RExC_rx_sv) unless SIZE_ONLY, then Simple_vFAIL3().
 558  */
 559 #define vFAIL3(m,a1,a2) STMT_START {                    \
 560     if (!SIZE_ONLY)                                     \
 561         SAVEFREESV(RExC_rx_sv);                         \
 562     Simple_vFAIL3(m, a1, a2);                           \
 563 } STMT_END
 564
 565 /*
 566  * Like Simple_vFAIL(), but accepts four arguments.
 567  */
 568 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 569     const IV offset = RExC_parse - RExC_precomp;                \
 570     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3,              \
 571             REPORT_LOCATION_ARGS(offset));      \
 572 } STMT_END
 573 /* Calls SAVEFREESV(RExC_rx_sv) unless SIZE_ONLY, then Simple_vFAIL4(). */
 574 #define vFAIL4(m,a1,a2,a3) STMT_START {                 \
 575     if (!SIZE_ONLY)                                     \
 576         SAVEFREESV(RExC_rx_sv);                         \
 577     Simple_vFAIL4(m, a1, a2, a3);                       \
 578 } STMT_END
 579
 580 /* A specialized version of vFAIL2 that works with UTF8f.  Its expansion
      * performs the same steps as vFAIL2, written out inline instead of going
      * through Simple_vFAIL2. */
 581 #define vFAIL2utf8f(m, a1) STMT_START { \
 582     const IV offset = RExC_parse - RExC_precomp;   \
 583     if (!SIZE_ONLY)                                \
 584         SAVEFREESV(RExC_rx_sv);                    \
 585     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
 586             REPORT_LOCATION_ARGS(offset));         \
 587 } STMT_END
 588
 589 /* Warning macros.  Each turns loc, a pointer into the pattern, into an
      * offset from RExC_precomp and appends REPORT_LOCATION to the message.
      * Except in reg_warn_non_literal_string, m is pasted directly next to
      * REPORT_LOCATION and so has to be a string literal (a format for the aN
      * arguments, if any).  A digit in a name is the count of m plus the aN
      * arguments.  loc appears unparenthesized in each expansion. */
 590 /* m is not necessarily a "literal string", in this macro: it is passed as
      * the argument of a %s conversion */
 591 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
 592     const IV offset = loc - RExC_precomp;                               \
 593     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
 594             m, REPORT_LOCATION_ARGS(offset));       \
 595 } STMT_END
 596
 597 #define ckWARNreg(loc,m) STMT_START {                                   \
 598     const IV offset = loc - RExC_precomp;                               \
 599     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 600             REPORT_LOCATION_ARGS(offset));              \
 601 } STMT_END
 602
 603 #define vWARN_dep(loc, m) STMT_START {                                  \
 604     const IV offset = loc - RExC_precomp;                               \
 605     Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION,     \
 606             REPORT_LOCATION_ARGS(offset));              \
 607 } STMT_END
 608
 609 #define ckWARNdep(loc,m) STMT_START {                                   \
 610     const IV offset = loc - RExC_precomp;                               \
 611     Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),                   \
 612             m REPORT_LOCATION,                                          \
 613             REPORT_LOCATION_ARGS(offset));              \
 614 } STMT_END
 615
 616 #define ckWARNregdep(loc,m) STMT_START {                                \
 617     const IV offset = loc - RExC_precomp;                               \
 618     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 619             m REPORT_LOCATION,                                          \
 620             REPORT_LOCATION_ARGS(offset));              \
 621 } STMT_END
 622
 623 #define ckWARN2reg_d(loc,m, a1) STMT_START {                            \
 624     const IV offset = loc - RExC_precomp;                               \
 625     Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),                       \
 626             m REPORT_LOCATION,                                          \
 627             a1, REPORT_LOCATION_ARGS(offset));  \
 628 } STMT_END
 629
 630 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 631     const IV offset = loc - RExC_precomp;                               \
 632     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 633             a1, REPORT_LOCATION_ARGS(offset));  \
 634 } STMT_END
 635
 636 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 637     const IV offset = loc - RExC_precomp;                               \
 638     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 639             a1, a2, REPORT_LOCATION_ARGS(offset));      \
 640 } STMT_END
 641
 642 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 643     const IV offset = loc - RExC_precomp;                               \
 644     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 645             a1, a2, REPORT_LOCATION_ARGS(offset));      \
 646 } STMT_END
 647
 648 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 649     const IV offset = loc - RExC_precomp;                               \
 650     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 651             a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 652 } STMT_END
 653
 654 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 655     const IV offset = loc - RExC_precomp;                               \
 656     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 657             a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 658 } STMT_END
 659
 660 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 661     const IV offset = loc - RExC_precomp;                               \
 662     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 663             a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
 664 } STMT_END
 665
 666
 667 /* REGC(c,s): store 'c' through the pointer 's' unless SIZE_ONLY is true.
      * Allow for side effects in s: in the SIZE_ONLY case 's' is still
      * evaluated (and discarded through the (void) cast), so any side effects
      * in 's' happen on both paths.  'c', on the other hand, is evaluated only
      * when SIZE_ONLY is false. */
 668 #define REGC(c,s) STMT_START {                  \
 669     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 670 } STMT_END
 671
 672 /* Macros for recording node offsets.   20001227 mjd@plover.com
 673  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 674  * element 2*n-1 of the array.  Element #2n holds the byte length of node #n.
 675  * Element 0 holds the number n.
 676  * Position is 1 indexed.
      *
      * When RE_TRACK_PATTERN_OFFSETS is not defined, every recording macro
      * below expands to nothing and only ProgLen()/SetProgLen() do real work,
      * using ri->u.proglen.  When it is defined, ProgLen()/SetProgLen() use
      * element 0 of ri->u.offsets instead, and the recording macros write into
      * RExC_offsets (but only when SIZE_ONLY is false).
 677  */
 678 #ifndef RE_TRACK_PATTERN_OFFSETS
 679 #define Set_Node_Offset_To_R(node,byte)
 680 #define Set_Node_Offset(node,byte)
 681 #define Set_Cur_Node_Offset
 682 #define Set_Node_Length_To_R(node,len)
 683 #define Set_Node_Length(node,len)
 684 #define Set_Node_Cur_Length(node,start)
 685 #define Node_Offset(n)
 686 #define Node_Length(n)
 687 #define Set_Node_Offset_Length(node,offset,len)
 688 #define ProgLen(ri) ri->u.proglen
 689 #define SetProgLen(ri,x) ri->u.proglen = x
 690 #else
 691 #define ProgLen(ri) ri->u.offsets[0]
 692 #define SetProgLen(ri,x) ri->u.offsets[0] = x
     /* Set_Node_Offset_To_R(node,byte): 'node' is a node NUMBER (not a
      * pointer); records 'byte' in RExC_offsets[2*node-1].  Croaks if 'node'
      * is negative.
      * NOTE(review): a 'node' of 0 passes the '< 0' check and would index
      * element -1; presumably callers never pass 0 since nodes are numbered
      * from 1 -- confirm. */
 693 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 694     if (! SIZE_ONLY) {                                                  \
 695         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 696                     __LINE__, (int)(node), (int)(byte)));               \
 697         if((node) < 0) {                                                \
 698             Perl_croak(aTHX_ "value of node is %d in Offset macro",     \
 699                                          (int)(node));                  \
 700         } else {                                                        \
 701             RExC_offsets[2*(node)-1] = (byte);                          \
 702         }                                                               \
 703     }                                                                   \
 704 } STMT_END
 705
     /* Pointer-taking wrappers: 'node' is converted to a number by subtracting
      * RExC_emit_start, and 'byte' to an offset by subtracting RExC_start.
      * Set_Cur_Node_Offset uses the current RExC_emit and RExC_parse. */
 706 #define Set_Node_Offset(node,byte) \
 707     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 708 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 709
     /* Set_Node_Length_To_R(node,len): 'node' is a node NUMBER; records 'len'
      * in RExC_offsets[2*node].  Croaks if 'node' is negative. */
 710 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 711     if (! SIZE_ONLY) {                                                  \
 712         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 713                 __LINE__, (int)(node), (int)(len)));                    \
 714         if((node) < 0) {                                                \
 715             Perl_croak(aTHX_ "value of node is %d in Length macro",     \
 716                                          (int)(node));                  \
 717         } else {                                                        \
 718             RExC_offsets[2*(node)] = (len);                             \
 719         }                                                               \
 720     }                                                                   \
 721 } STMT_END
 722
     /* Pointer-taking wrappers for the length.  Set_Node_Cur_Length() records
      * 'RExC_parse - start'; note 'start' is expanded without parentheses. */
 723 #define Set_Node_Length(node,len) \
 724     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 725 #define Set_Node_Cur_Length(node, start)                \
 726     Set_Node_Length(node, RExC_parse - start)
 727
 728 /* Get offsets and lengths; 'n' is a node pointer, as with Set_Node_Offset() */
 729 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 730 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 731
     /* Record both in one go; 'node' is a pointer, but 'offset' and 'len' are
      * stored as given (no RExC_start adjustment is applied to 'offset') */
 732 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 733     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 734     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 735 } STMT_END
 736 #endif
 737
 738 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
     /* Defined (with no value) only when the build switch above is non-zero */
 739 #define EXPERIMENTAL_INPLACESCAN
 740 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 741
     /* Debugging aid, wrapped in DEBUG_OPTIMISE_MORE_r: prints "RExC_seen: "
      * to Perl_debug_log, then the name of each of the REG_..._SEEN bits
      * tested below that is set in RExC_seen, then a newline.
      * Note that the expansion already ends in ';', so an invocation that is
      * followed by its own ';' produces an additional empty statement. */
 742 #define DEBUG_RExC_seen() \
 743         DEBUG_OPTIMISE_MORE_r({                                             \
 744             PerlIO_printf(Perl_debug_log,"RExC_seen: ");                    \
 745                                                                             \
 746             if (RExC_seen & REG_ZERO_LEN_SEEN)                              \
 747                 PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN ");         \
 748                                                                             \
 749             if (RExC_seen & REG_LOOKBEHIND_SEEN)                            \
 750                 PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN ");       \
 751                                                                             \
 752             if (RExC_seen & REG_GPOS_SEEN)                                  \
 753                 PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN ");             \
 754                                                                             \
 755             if (RExC_seen & REG_CANY_SEEN)                                  \
 756                 PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN ");             \
 757                                                                             \
 758             if (RExC_seen & REG_RECURSE_SEEN)                               \
 759                 PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN ");          \
 760                                                                             \
 761             if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)                         \
 762                 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN ");    \
 763                                                                             \
 764             if (RExC_seen & REG_VERBARG_SEEN)                               \
 765                 PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN ");          \
 766                                                                             \
 767             if (RExC_seen & REG_CUTGROUP_SEEN)                              \
 768                 PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN ");         \
 769                                                                             \
 770             if (RExC_seen & REG_RUN_ON_COMMENT_SEEN)                        \
 771                 PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN ");   \
 772                                                                             \
 773             if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)                        \
 774                 PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN ");   \
 775                                                                             \
 776             if (RExC_seen & REG_GOSTART_SEEN)                               \
 777                 PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN ");          \
 778                                                                             \
 779             if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                               \
 780                 PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN ");          \
 781                                                                             \
 782             PerlIO_printf(Perl_debug_log,"\n");                             \
 783         });
 784
     /* Debugging aid, wrapped in DEBUG_OPTIMISE_MORE_r: dumps the fields of the
      * study data pointed to by 'data' to Perl_debug_log.  Does nothing if
      * 'data' is NULL.
      *
      *  str    is juxtaposed with the format string, so it has to be a string
      *         literal; it labels the output line
      *  data   pointer whose pos_min, pos_delta, flags, whilem_c and
      *         last_closep members are printed; if its last_found member is
      *         non-NULL the last/fixed/float substring details are printed too,
      *         with a "*" marking whichever of longest_fixed and longest_float
      *         data->longest currently points at
      *  depth  the output is indented by 2*depth columns
      *
      * CAUTION: the expansion also reads a variable named 'is_inf' which is NOT
      * a macro parameter; it must be in scope wherever this macro is used.  As
      * the expansion already ends in ';', an invocation followed by its own ';'
      * produces an additional empty statement. */
 785 #define DEBUG_STUDYDATA(str,data,depth)                              \
 786 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 787     PerlIO_printf(Perl_debug_log,                                    \
 788         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 789         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 790         (int)(depth)*2, "",                                          \
 791         (IV)((data)->pos_min),                                       \
 792         (IV)((data)->pos_delta),                                     \
 793         (UV)((data)->flags),                                         \
 794         (IV)((data)->whilem_c),                                      \
 795         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 796         is_inf ? "INF " : ""                                         \
 797     );                                                               \
 798     if ((data)->last_found)                                          \
 799         PerlIO_printf(Perl_debug_log,                                \
 800             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 801             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 802             SvPVX_const((data)->last_found),                         \
 803             (IV)((data)->last_end),                                  \
 804             (IV)((data)->last_start_min),                            \
 805             (IV)((data)->last_start_max),                            \
 806             ((data)->longest &&                                      \
 807              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 808             SvPVX_const((data)->longest_fixed),                      \
 809             (IV)((data)->offset_fixed),                              \
 810             ((data)->longest &&                                      \
 811              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 812             SvPVX_const((data)->longest_float),                      \
 813             (IV)((data)->offset_float_min),                          \
 814             (IV)((data)->offset_float_max)                           \
 815         );                                                           \
 816     PerlIO_printf(Perl_debug_log,"\n");                              \
 817 });
 818
 819 /* Mark that we cannot extend a found fixed substring at this point.
 820    Update the longest found anchored substring and the longest found
 821    floating substrings if needed.

        data     study state.  data->last_found is the candidate substring and
                 *data->longest is the slot (either data->longest_fixed or
                 data->longest_float) that it competes for.
        minlenp  stored as-is into data->minlen_fixed or data->minlen_float
                 when the candidate wins.
        is_inf   when true, the maximum offset of a floating substring is
                 forced to SSize_t_MAX.  It is also read, by name, by the
                 DEBUG_STUDYDATA() call at the end.

        Whether or not the candidate wins, on return data->last_found has been
        emptied, data->last_end is -1 and SF_BEFORE_EOL has been cleared from
        data->flags. */
 822
 823 STATIC void
 824 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
 825                     SSize_t *minlenp, int is_inf)
 826 {
 827     const STRLEN l = CHR_SVLEN(data->last_found);
 828     const STRLEN old_l = CHR_SVLEN(*data->longest);
 829     GET_RE_DEBUG_FLAGS_DECL;
 830
 831     PERL_ARGS_ASSERT_SCAN_COMMIT;
 832
         /* The condition is equivalent to: the candidate is strictly longer,
          * or it is the same length and SF_BEFORE_EOL is set.  So a tie
          * replaces the stored substring only when that flag is set. */
 833     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 834         SvSetMagicSV(*data->longest, data->last_found);
 835         if (*data->longest == data->longest_fixed) {
                 /* An empty candidate has no start of its own, so pos_min is
                  * used for its offset */
 836             data->offset_fixed = l ? data->last_start_min : data->pos_min;
                 /* Carry SF_BEFORE_EOL over to the fixed-substring flag
                  * position by shifting; otherwise clear that flag */
 837             if (data->flags & SF_BEFORE_EOL)
 838                 data->flags
 839                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 840             else
 841                 data->flags &= ~SF_FIX_BEFORE_EOL;
 842             data->minlen_fixed=minlenp;
 843             data->lookbehind_fixed=0;
 844         }
 845         else { /* *data->longest == data->longest_float */
 846             data->offset_float_min = l ? data->last_start_min : data->pos_min;
                 /* For an empty candidate the maximum is pos_min + pos_delta,
                  * except that a pos_delta of SSize_t_MAX is passed through
                  * unchanged instead of being added (presumably to avoid
                  * overflowing the sum) */
 847             data->offset_float_max = (l
 848                                       ? data->last_start_max
 849                                       : (data->pos_delta == SSize_t_MAX
 850                                          ? SSize_t_MAX
 851                                          : data->pos_min + data->pos_delta));
                 /* Clamp: the comparison is done on the values cast to STRLEN;
                  * presumably this catches a sum that has gone negative --
                  * TODO confirm STRLEN is unsigned here */
 852             if (is_inf
 853                  || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
 854                 data->offset_float_max = SSize_t_MAX;
                 /* Same flag transfer as in the fixed case, but to the
                  * floating-substring position */
 855             if (data->flags & SF_BEFORE_EOL)
 856                 data->flags
 857                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 858             else
 859                 data->flags &= ~SF_FL_BEFORE_EOL;
 860             data->minlen_float=minlenp;
 861             data->lookbehind_float=0;
 862         }
 863     }
         /* Start afresh: truncate last_found to zero length ... */
 864     SvCUR_set(data->last_found, 0);
 865     {
             /* ... and, if it is UTF-8 and magical, also zero the mg_len of its
              * PERL_MAGIC_utf8 magic, if it has any (presumably a cached
              * length that would otherwise be stale -- TODO confirm) */
 866         SV * const sv = data->last_found;
 867         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 868             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 869             if (mg)
 870                 mg->mg_len = 0;
 871         }
 872     }
 873     data->last_end = -1;
 874     data->flags &= ~SF_BEFORE_EOL;
 875     DEBUG_STUDYDATA("commit: ",data,0);
 876 }
 877
 878 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
 879  * list that describes which code points it matches */
 880
 881 STATIC void
 882 S_ssc_anything(pTHX_ regnode_ssc *ssc)
 883 {
 884     /* Set the SSC 'ssc' to match an empty string or any code point.
          *
          * 'ssc' must already be marked as synthetic (this is asserted).  Its
          * 'invlist' member is overwritten with a brand new mortal inversion
          * list holding the single range 0 .. UV_MAX; whatever it pointed to
          * before is not touched here.  ANYOF_EMPTY_STRING is OR'ed into the
          * flags, so any flags that were already set are kept. */
 885
 886     PERL_ARGS_ASSERT_SSC_ANYTHING;
 887
 888     assert(is_ANYOF_SYNTHETIC(ssc));
 889
 890     ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
 891     _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
 892     ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING;    /* Plus match empty string */
 893 }
 894
 895 STATIC int
 896 S_ssc_is_anything(pTHX_ const regnode_ssc *ssc)
 897 {
 898     /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
 899      * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
 900      * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
 901      * in any way, so there's no point in using it.
          *
          * 'ssc' must be marked as synthetic (this is asserted).  The checks
          * are, in order:
          *  1) ANYOF_EMPTY_STRING must be set, else the answer is FALSE;
          *  2) if the first range of the inversion list is 0 .. UV_MAX, the
          *     answer is TRUE;
          *  3) otherwise, if any posix class is set, the answer is TRUE when
          *     some adjacent pair of class indices (i, i+1), i even, are both
          *     set;
          *  4) otherwise FALSE. */
 902
 903     UV start, end;
 904     bool ret;
 905
 906     PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
 907
 908     assert(is_ANYOF_SYNTHETIC(ssc));
 909
 910     if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) {
 911         return FALSE;
 912     }
 913
 914     /* See if the list consists solely of the range 0 - Infinity.  Only the
          * first range needs to be fetched, as nothing can follow one that ends
          * at UV_MAX. */
 915     invlist_iterinit(ssc->invlist);
 916     ret = invlist_iternext(ssc->invlist, &start, &end)
 917           && start == 0
 918           && end == UV_MAX;
 919
         /* The iteration is explicitly finished before any return below */
 920     invlist_iterfinish(ssc->invlist);
 921
 922     if (ret) {
 923         return TRUE;
 924     }
 925
 926     /* If e.g., both \w and \W are set, matches everything.  The loop steps by
          * two and tests indices i and i+1 together; presumably each class and
          * its complement occupy such an even/odd pair of indices. */
 927     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
 928         int i;
 929         for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
 930             if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
 931                 return TRUE;
 932             }
 933         }
 934     }
 935
 936     return FALSE;
 937 }
 938
 939 STATIC void
 940 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
 941 {
 942     /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
 943      * string, any code point, or any posix class under locale.
          *
          * Any previous contents of '*ssc' are discarded: the structure is
          * zeroed first, then marked as synthetic, given the argument
          * ANYOF_NONBITMAP_EMPTY, and set to match everything by
          * ssc_anything(). */
 944
 945     PERL_ARGS_ASSERT_SSC_INIT;
 946
 947     Zero(ssc, 1, regnode_ssc);
 948     set_ANYOF_SYNTHETIC(ssc);
 949     ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY);
 950     ssc_anything(ssc);
 951
 952     /* If any portion of the regex is to operate under locale rules,
 953      * initialization includes it.  The reason this isn't done for all regexes
 954      * is that the optimizer was written under the assumption that locale was
 955      * all-or-nothing.  Given the complexity and lack of documentation in the
 956      * optimizer, and that there are inadequate test cases for locale, many
 957      * parts of it may not work properly, it is safest to avoid locale unless
 958      * necessary.  Hence all the posix classes are set when the pattern
          * contains locale, and all are cleared otherwise. */
 959     if (RExC_contains_locale) {
 960         ANYOF_POSIXL_SETALL(ssc);
 961     }
 962     else {
 963         ANYOF_POSIXL_ZERO(ssc);
 964     }
 965 }
 966
 967 STATIC int
 968 S_ssc_is_cp_posixl_init(pTHX_ const RExC_state_t *pRExC_state,
 969                               const regnode_ssc *ssc)
 970 {
 971     /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
 972      * to the list of code points matched, and locale posix classes (hence it
 973      * does not check its flags).
          *
          * Concretely: the first range of the inversion list has to be
          * 0 .. UV_MAX, and, only when the pattern contains locale, all the
          * posix classes have to be set.  When the pattern does not contain
          * locale, the posix classes are not examined at all.  'ssc' must be
          * marked as synthetic (this is asserted). */
 974
 975     UV start, end;
 976     bool ret;
 977
 978     PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
 979
 980     assert(is_ANYOF_SYNTHETIC(ssc));
 981
         /* Only the first range needs to be fetched, as nothing can follow one
          * that ends at UV_MAX */
 982     invlist_iterinit(ssc->invlist);
 983     ret = invlist_iternext(ssc->invlist, &start, &end)
 984           && start == 0
 985           && end == UV_MAX;
 986
 987     invlist_iterfinish(ssc->invlist);
 988
 989     if (! ret) {
 990         return FALSE;
 991     }
 992
 993     if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
 994         return FALSE;
 995     }
 996
 997     return TRUE;
 998 }
 999
1000 STATIC SV*
1001 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1002                                const regnode_charclass* const node)
1003 {
1004     /* Returns a mortal inversion list defining which code points are matched
1005      * by 'node', which is of type ANYOF.  Handles complementing the result if
1006      * appropriate.  If some code points aren't knowable at this time, the
1007      * returned list must, and will, contain every code point that is a
1008      * possibility.
          *
          * The list is assembled from, in order: the auxiliary data attached to
          * the node (if its argument isn't ANYOF_NONBITMAP_EMPTY); the node's
          * 256-entry bitmap; the ANYOF_NON_UTF8_NON_ASCII_ALL and
          * ANYOF_ABOVE_LATIN1_ALL flags; inversion or /li widening; and finally
          * the code points that match only under a UTF-8 locale.  'node' itself
          * is not modified. */
1009
1010     SV* invlist = sv_2mortal(_new_invlist(0));
1011     SV* only_utf8_locale_invlist = NULL;
1012     unsigned int i;
1013     const U32 n = ARG(node);
1014     bool new_node_has_latin1 = FALSE;
1015
1016     PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1017
1018     /* Look at the data structure created by S_set_ANYOF_arg().  As used
          * below, element [1] is the compile-time swash, [0] indicates things
          * known only at run time, [3] is the node's own inversion list, and [2]
          * is the list of code points valid only under UTF-8 locales. */
1019     if (n != ANYOF_NONBITMAP_EMPTY) {
1020         SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1021         AV * const av = MUTABLE_AV(SvRV(rv));
1022         SV **const ary = AvARRAY(av);
1023         assert(RExC_rxi->data->what[n] == 's');
1024
1025         if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1026             invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1027         }
1028         else if (ary[0] && ary[0] != &PL_sv_undef) {
1029
1030             /* Here, no compile-time swash, and there are things that won't be
1031              * known until runtime -- we have to assume it could be anything.
                  * Note that this returns immediately, skipping all of the
                  * bitmap, flag, and inversion processing below. */
1032             return _add_range_to_invlist(invlist, 0, UV_MAX);
1033         }
1034         else if (ary[3] && ary[3] != &PL_sv_undef) {
1035
1036             /* Here no compile-time swash, and no run-time only data.  Use the
1037              * node's inversion list */
1038             invlist = sv_2mortal(invlist_clone(ary[3]));
1039         }
1040
1041         /* Get the code points valid only under UTF-8 locales.  The list is
              * only remembered here (not copied); it gets folded in at the very
              * end of the function. */
1042         if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
1043             && ary[2] && ary[2] != &PL_sv_undef)
1044         {
1045             only_utf8_locale_invlist = ary[2];
1046         }
1047     }
1048
1049     /* An ANYOF node contains a bitmap for the first 256 code points, and an
1050      * inversion list for the others, but if there are code points that should
1051      * match only conditionally on the target string being UTF-8, those are
1052      * placed in the inversion list, and not the bitmap.  Since there are
1053      * circumstances under which they could match, they are included in the
1054      * SSC.  But if the ANYOF node is to be inverted, we have to exclude them
1055      * here, so that when we invert below, the end result actually does include
1056      * them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We have to do this here
1057      * before we add the unconditionally matched code points */
1058     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1059         _invlist_intersection_complement_2nd(invlist,
1060                                              PL_UpperLatin1,
1061                                              &invlist);
1062     }
1063
1064     /* Add in the points from the bit map, remembering whether there were any,
          * which is needed for the /li handling below */
1065     for (i = 0; i < 256; i++) {
1066         if (ANYOF_BITMAP_TEST(node, i)) {
1067             invlist = add_cp_to_invlist(invlist, i);
1068             new_node_has_latin1 = TRUE;
1069         }
1070     }
1071
1072     /* If this can match all upper Latin1 code points, have to add them
1073      * as well */
1074     if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_NON_ASCII_ALL) {
1075         _invlist_union(invlist, PL_UpperLatin1, &invlist);
1076     }
1077
1078     /* Similarly for these */
1079     if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
1080         invlist = _add_range_to_invlist(invlist, 256, UV_MAX);
1081     }
1082
         /* The /li widening below is applied only to non-inverted nodes */
1083     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1084         _invlist_invert(invlist);
1085     }
1086     else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
1087
1088         /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1089          * locale.  We can skip this if there are no 0-255 at all. */
1090         _invlist_union(invlist, PL_Latin1, &invlist);
1091     }
1092
1093     /* Similarly add the UTF-8 locale possible matches.  These have to be
1094      * deferred until after the non-UTF-8 locale ones are taken care of just
1095      * above, or it leads to wrong results under ANYOF_INVERT.  When the node
          * is inverted, it is the complement of the saved list that gets
          * united in. */
1096     if (only_utf8_locale_invlist) {
1097         _invlist_union_maybe_complement_2nd(invlist,
1098                                             only_utf8_locale_invlist,
1099                                             ANYOF_FLAGS(node) & ANYOF_INVERT,
1100                                             &invlist);
1101     }
1102
1103     return invlist;
1104 }
1105
1106 /* ssc_init_zero and ssc_init currently do the exact same thing, so the
      * former is simply an alias for the latter */
1107 #define ssc_init_zero           ssc_init
1108
     /* Convenience wrappers around ssc_add_range(): add the single code point
      * 'cp' (as the range cp .. cp), or add the entire range 0 .. UV_MAX */
1109 #define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
1110 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1111
1112 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
1113  * should not be inverted.  'and_with->flags & ANYOF_POSIXL' should be 0 if
1114  * 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1115
1116 STATIC void
1117 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1118                 const regnode_charclass *and_with)
1119 {
1120     /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1121      * another SSC or a regular ANYOF class.  Can create false positives.
          *
          * 'ssc' is modified in place and must be marked as synthetic (this is
          * asserted); 'and_with' is only read.  The flags are combined first,
          * by a plain bitwise AND, and then the code point list and posix
          * classes are combined according to which of the cases described in
          * the long comment below applies. */
1122
1123     SV* anded_cp_list;
1124     U8  anded_flags;
1125
1126     PERL_ARGS_ASSERT_SSC_AND;
1127
1128     assert(is_ANYOF_SYNTHETIC(ssc));
1129
1130     /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1131      * the code point inversion list and just the relevant flags */
1132     if (is_ANYOF_SYNTHETIC(and_with)) {
1133         anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1134         anded_flags = ANYOF_FLAGS(and_with);
1135
1136         /* XXX This is a kludge around what appears to be deficiencies in the
1137          * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
1138          * there are paths through the optimizer where it doesn't get weeded
1139          * out when it should.  And if we don't make some extra provision for
1140          * it like the code just below, it doesn't get added when it should.
1141          * This solution is to add it only when AND'ing, which is here, and
1142          * only when what is being AND'ed is the pristine, original node
1143          * matching anything.  Thus it is like adding it to ssc_anything() but
1144          * only when the result is to be AND'ed.  Probably the same solution
1145          * could be adopted for the same problem we have with /l matching,
1146          * which is solved differently in S_ssc_init(), and that would lead to
1147          * fewer false positives than that solution has.  But if this solution
1148          * creates bugs, the consequences are only that a warning isn't raised
1149          * that should be; while the consequences for having /l bugs is
1150          * incorrect matches */
1151         if (ssc_is_anything((regnode_ssc *)and_with)) {
1152             anded_flags |= ANYOF_WARN_SUPER;
1153         }
1154     }
1155     else {
1156         anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1157         anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1158     }
1159
1160     ANYOF_FLAGS(ssc) &= anded_flags;
1161
1162     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1163      * C2 is the list of code points in 'and-with'; P2, its posix classes.
1164      * 'and_with' may be inverted.  When not inverted, we have the situation of
1165      * computing:
1166      *  (C1 | P1) & (C2 | P2)
1167      *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1168      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1169      *                    <=  ((C1 & C2) |       P2 ) | ( P1       | (P1 & P2))
1170      *                    <=  ((C1 & C2) | P1 | P2)
1171      * Alternatively, the last few steps could be:
1172      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1173      *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
1174      *                    <=  (C1 | C2 | (P1 & P2))
1175      * We favor the second approach if either P1 or P2 is non-empty.  This is
1176      * because these components are a barrier to doing optimizations, as what
1177      * they match cannot be known until the moment of matching as they are
1178      * dependent on the current locale; 'AND'ing them likely will reduce or
1179      * eliminate them.
1180      * But we can do better if we know that C1,P1 are in their initial state (a
1181      * frequent occurrence), each matching everything:
1182      *  (<everything>) & (C2 | P2) =  C2 | P2
1183      * Similarly, if C2,P2 are in their initial state (again a frequent
1184      * occurrence), the result is a no-op
1185      *  (C1 | P1) & (<everything>) =  C1 | P1
1186      *
1187      * Inverted, we have
1188      *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
1189      *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1190      *                         <=  (C1 & ~C2) | (P1 & ~P2)
1191      * */
1192
         /* The inverted case applies only to a regular ANYOF node; an SSC with
          * ANYOF_INVERT set is handled by the non-inverted code further down */
1193     if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1194         && ! is_ANYOF_SYNTHETIC(and_with))
1195     {
1196         unsigned int i;
1197
1198         ssc_intersection(ssc,
1199                          anded_cp_list,
1200                          FALSE /* Has already been inverted */
1201                          );
1202
1203         /* If either P1 or P2 is empty, the intersection will be also; can skip
1204          * the loop */
1205         if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) {
1206             ANYOF_POSIXL_ZERO(ssc);
1207         }
1208         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1209
1210             /* Note that the Posix class component P from 'and_with' actually
1211              * looks like:
1212              *      P = Pa | Pb | ... | Pn
1213              * where each component is one posix class, such as in [\w\s].
1214              * Thus
1215              *      ~P = ~(Pa | Pb | ... | Pn)
1216              *         = ~Pa & ~Pb & ... & ~Pn
1217              *        <= ~Pa | ~Pb | ... | ~Pn
1218              * The last is something we can easily calculate, but unfortunately
1219              * is likely to have many false positives.  We could do better
1220              * in some (but certainly not all) instances if two classes in
1221              * P have known relationships.  For example
1222              *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1223              * So
1224              *      :lower: & :print: = :lower:
1225              * And similarly for classes that must be disjoint.  For example,
1226              * since \s and \w can have no elements in common based on rules in
1227              * the POSIX standard,
1228              *      \w & ^\S = nothing
1229              * Unfortunately, some vendor locales do not meet the Posix
1230              * standard, in particular almost everything by Microsoft.
1231              * The loop below just changes e.g., \w into \W and vice versa */
1232
1233             regnode_charclass_posixl temp;
1234             int add = 1;    /* To calculate the index of the complement */
1235
                 /* 'add' alternates between +1 and -1 on each iteration, so an
                  * even index i sets bit i+1 in 'temp' and an odd index i sets
                  * bit i-1: each class is swapped with the other member of its
                  * even/odd pair.  The assert checks that 'and_with' never has
                  * both members of a pair set.
                  * NOTE(review): the bound here is ANYOF_MAX, whereas the
                  * similar pairwise loop in S_ssc_is_anything() stops at
                  * ANYOF_POSIXL_MAX; confirm the two limits are meant to
                  * differ. */
1236             ANYOF_POSIXL_ZERO(&temp);
1237             for (i = 0; i < ANYOF_MAX; i++) {
1238                 assert(i % 2 != 0
1239                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1240                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1241
1242                 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1243                     ANYOF_POSIXL_SET(&temp, i + add);
1244                 }
1245                 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1246             }
                 /* Presumably ANDs 'temp' into the posix classes of 'ssc' (the
                  * second argument being the destination) -- TODO confirm
                  * against the macro's definition */
1247             ANYOF_POSIXL_AND(&temp, ssc);
1248
1249         } /* else ssc already has no posixes */
1250     } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
1251          in its initial state */
1252     else if (! is_ANYOF_SYNTHETIC(and_with)
1253              || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1254     {
1255         /* But if 'ssc' is in its initial state, the result is just 'and_with';
1256          * copy it over 'ssc' */
1257         if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1258             if (is_ANYOF_SYNTHETIC(and_with)) {
                     /* NOTE(review): StructCopy presumably copies the entire
                      * structure, which would also replace the flags that
                      * were computed into 'ssc' above (including any
                      * ANYOF_WARN_SUPER adjustment); confirm that is
                      * intended */
1259                 StructCopy(and_with, ssc, regnode_ssc);
1260             }
1261             else {
                     /* A regular ANYOF node: take its code point list, and its
                      * posix classes only if it is flagged as having any */
1262                 ssc->invlist = anded_cp_list;
1263                 ANYOF_POSIXL_ZERO(ssc);
1264                 if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1265                     ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1266                 }
1267             }
1268         }
1269         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1270                  || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL))
1271         {
1272             /* One or the other of P1, P2 is non-empty.  This is the second
                  * approach from the long comment above: the code point lists
                  * are united, not intersected, and only the posix classes are
                  * ANDed. */
1273             if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1274                 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1275             }
1276             ssc_union(ssc, anded_cp_list, FALSE);
1277         }
1278         else { /* P1 = P2 = empty */
1279             ssc_intersection(ssc, anded_cp_list, FALSE);
1280         }
1281     }
1282 }
1283
1284 STATIC void
1285 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1286                const regnode_charclass *or_with)
1287 {
1288     /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1289      * another SSC or a regular ANYOF class.  Can create false positives if
1290      * 'or_with' is to be inverted. */
1291
1292     SV* ored_cp_list;
1293     U8 ored_flags;
1294
1295     PERL_ARGS_ASSERT_SSC_OR;
1296
1297     assert(is_ANYOF_SYNTHETIC(ssc));
1298
1299     /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1300      * the code point inversion list and just the relevant flags */
1301     if (is_ANYOF_SYNTHETIC(or_with)) {
1302         ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1303         ored_flags = ANYOF_FLAGS(or_with);
1304     }
1305     else {
1306         ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1307         ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1308     }
1309
1310     ANYOF_FLAGS(ssc) |= ored_flags;
1311
1312     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1313      * C2 is the list of code points in 'or-with'; P2, its posix classes.
1314      * 'or_with' may be inverted.  When not inverted, we have the simple
1315      * situation of computing:
1316      *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
1317      * If P1|P2 yields a situation with both a class and its complement are
1318      * set, like having both \w and \W, this matches all code points, and we
1319      * can delete these from the P component of the ssc going forward.  XXX We
1320      * might be able to delete all the P components, but I (khw) am not certain
1321      * about this, and it is better to be safe.
1322      *
1323      * Inverted, we have
1324      *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
1325      *                         <=  (C1 | P1) | ~C2
1326      *                         <=  (C1 | ~C2) | P1
1327      * (which results in actually simpler code than the non-inverted case)
1328      * */
1329
1330     if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1331         && ! is_ANYOF_SYNTHETIC(or_with))
1332     {
1333         /* We ignore P2, leaving P1 going forward */
1334     }   /* else  Not inverted */
1335     else if (ANYOF_FLAGS(or_with) & ANYOF_POSIXL) {
1336         ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1337         if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1338             unsigned int i;
1339             for (i = 0; i < ANYOF_MAX; i += 2) {
1340                 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1341                 {
1342                     ssc_match_all_cp(ssc);
1343                     ANYOF_POSIXL_CLEAR(ssc, i);
1344                     ANYOF_POSIXL_CLEAR(ssc, i+1);
1345                 }
1346             }
1347         }
1348     }
1349
1350     ssc_union(ssc,
1351               ored_cp_list,
1352               FALSE /* Already has been inverted */
1353               );
1354 }
1355
1356 PERL_STATIC_INLINE void
1357 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1358 {
1359     PERL_ARGS_ASSERT_SSC_UNION;
1360
1361     assert(is_ANYOF_SYNTHETIC(ssc));
1362
1363     _invlist_union_maybe_complement_2nd(ssc->invlist,
1364                                         invlist,
1365                                         invert2nd,
1366                                         &ssc->invlist);
1367 }
1368
1369 PERL_STATIC_INLINE void
1370 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1371                          SV* const invlist,
1372                          const bool invert2nd)
1373 {
1374     PERL_ARGS_ASSERT_SSC_INTERSECTION;
1375
1376     assert(is_ANYOF_SYNTHETIC(ssc));
1377
1378     _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1379                                                invlist,
1380                                                invert2nd,
1381                                                &ssc->invlist);
1382 }
1383
1384 PERL_STATIC_INLINE void
1385 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1386 {
1387     PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1388
1389     assert(is_ANYOF_SYNTHETIC(ssc));
1390
1391     ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1392 }
1393
1394 PERL_STATIC_INLINE void
1395 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1396 {
1397     /* AND just the single code point 'cp' into the SSC 'ssc' */
1398
1399     SV* cp_list = _new_invlist(2);
1400
1401     PERL_ARGS_ASSERT_SSC_CP_AND;
1402
1403     assert(is_ANYOF_SYNTHETIC(ssc));
1404
1405     cp_list = add_cp_to_invlist(cp_list, cp);
1406     ssc_intersection(ssc, cp_list,
1407                      FALSE /* Not inverted */
1408                      );
1409     SvREFCNT_dec_NN(cp_list);
1410 }
1411
1412 PERL_STATIC_INLINE void
1413 S_ssc_clear_locale(pTHX_ regnode_ssc *ssc)
1414 {
1415     /* Set the SSC 'ssc' to not match any locale things */
1416
1417     PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1418
1419     assert(is_ANYOF_SYNTHETIC(ssc));
1420
1421     ANYOF_POSIXL_ZERO(ssc);
1422     ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1423 }
1424
1425 STATIC void
1426 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1427 {
1428     /* The inversion list in the SSC is marked mortal; now we need a more
1429      * permanent copy, which is stored the same way that is done in a regular
1430      * ANYOF node, with the first 256 code points in a bit map */
1431
1432     SV* invlist = invlist_clone(ssc->invlist);
1433
1434     PERL_ARGS_ASSERT_SSC_FINALIZE;
1435
1436     assert(is_ANYOF_SYNTHETIC(ssc));
1437
1438     /* The code in this file assumes that all but these flags aren't relevant
1439      * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the
1440      * time we reach here */
1441     assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
1442
1443     populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1444
1445     set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1446                                 NULL, NULL, NULL, FALSE);
1447
1448     /* Make sure is clone-safe */
1449     ssc->invlist = NULL;
1450
1451     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1452         ANYOF_FLAGS(ssc) |= ANYOF_POSIXL;
1453     }
1454
1455     assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1456 }
1457
/* Accessors for the per-state transition lists used while a trie is being
 * built.  Every one of them expects a variable named 'trie' to be in scope.
 *
 * Element 0 of a list is a header rather than a transition: its 'forid'
 * member is the index of the next free slot (TRIE_LIST_CUR) and its
 * 'newstate' member is the number of slots allocated (TRIE_LIST_LEN).
 * TRIE_LIST_CUR and TRIE_LIST_LEN expand to lvalues.
 *
 * TRIE_LIST_USED yields the number of transitions stored for a state, or 0
 * if the state has no list at all.  It uses only its own argument; it no
 * longer silently requires the caller's variable to be named 'state'. */
#define TRIE_LIST_ITEM(state,idx) (trie->states[(state)].trans.list)[ (idx) ]
#define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
#define TRIE_LIST_USED(idx)  ( trie->states[(idx)].trans.list         \
                               ? (TRIE_LIST_CUR( idx ) - 1)           \
                               : 0 )
1464
1465
1466 #ifdef DEBUGGING
1467 /*
1468    dump_trie(trie,widecharmap,revcharmap,depth)
1469    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc,depth)
1470    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc,depth)
1471
1472    These routines dump out a trie in a somewhat readable format.
1473    The _interim_ variants are used for debugging the interim
1474    tables that are used to generate the final compressed
1475    representation which is what dump_trie expects.
1476
1477    Part of the reason for their existence is to provide a form
1478    of documentation as to how the different representations function.
1479
1480 */
1481
1482 /*
1483   Dumps the final compressed table form of the trie to Perl_debug_log.
1484   Used for debugging make_trie().
1485 */
1486
1487 STATIC void
1488 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1489             AV *revcharmap, U32 depth)
1490 {
1491     U32 state;
1492     SV *sv=sv_newmortal();
1493     int colwidth= widecharmap ? 6 : 4;
1494     U16 word;
1495     GET_RE_DEBUG_FLAGS_DECL;
1496
1497     PERL_ARGS_ASSERT_DUMP_TRIE;
1498
1499     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1500         (int)depth * 2 + 2,"",
1501         "Match","Base","Ofs" );
1502
1503     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1504         SV ** const tmp = av_fetch( revcharmap, state, 0);
1505         if ( tmp ) {
1506             PerlIO_printf( Perl_debug_log, "%*s",
1507                 colwidth,
1508                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1509                             PL_colors[0], PL_colors[1],
1510                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1511                             PERL_PV_ESCAPE_FIRSTCHAR
1512                 )
1513             );
1514         }
1515     }
1516     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1517         (int)depth * 2 + 2,"");
1518
1519     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1520         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1521     PerlIO_printf( Perl_debug_log, "\n");
1522
1523     for( state = 1 ; state < trie->statecount ; state++ ) {
1524         const U32 base = trie->states[ state ].trans.base;
1525
1526         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1527                                        (int)depth * 2 + 2,"", (UV)state);
1528
1529         if ( trie->states[ state ].wordnum ) {
1530             PerlIO_printf( Perl_debug_log, " W%4X",
1531                                            trie->states[ state ].wordnum );
1532         } else {
1533             PerlIO_printf( Perl_debug_log, "%6s", "" );
1534         }
1535
1536         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1537
1538         if ( base ) {
1539             U32 ofs = 0;
1540
1541             while( ( base + ofs  < trie->uniquecharcount ) ||
1542                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1543                      && trie->trans[ base + ofs - trie->uniquecharcount ].check
1544                                                                     != state))
1545                     ofs++;
1546
1547             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1548
1549             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1550                 if ( ( base + ofs >= trie->uniquecharcount )
1551                         && ( base + ofs - trie->uniquecharcount
1552                                                         < trie->lasttrans )
1553                         && trie->trans[ base + ofs
1554                                     - trie->uniquecharcount ].check == state )
1555                 {
1556                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1557                     colwidth,
1558                     (UV)trie->trans[ base + ofs
1559                                              - trie->uniquecharcount ].next );
1560                 } else {
1561                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1562                 }
1563             }
1564
1565             PerlIO_printf( Perl_debug_log, "]");
1566
1567         }
1568         PerlIO_printf( Perl_debug_log, "\n" );
1569     }
1570     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1571                                 (int)depth*2, "");
1572     for (word=1; word <= trie->wordcount; word++) {
1573         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1574             (int)word, (int)(trie->wordinfo[word].prev),
1575             (int)(trie->wordinfo[word].len));
1576     }
1577     PerlIO_printf(Perl_debug_log, "\n" );
1578 }
1579 /*
1580   Dumps a fully constructed but uncompressed trie in list form.
1581   List tries normally only are used for construction when the number of
1582   possible chars (trie->uniquecharcount) is very high.
1583   Used for debugging make_trie().
1584 */
1585 STATIC void
1586 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1587                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1588                          U32 depth)
1589 {
1590     U32 state;
1591     SV *sv=sv_newmortal();
1592     int colwidth= widecharmap ? 6 : 4;
1593     GET_RE_DEBUG_FLAGS_DECL;
1594
1595     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1596
1597     /* print out the table precompression.  */
1598     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1599         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1600         "------:-----+-----------------\n" );
1601
1602     for( state=1 ; state < next_alloc ; state ++ ) {
1603         U16 charid;
1604
1605         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1606             (int)depth * 2 + 2,"", (UV)state  );
1607         if ( ! trie->states[ state ].wordnum ) {
1608             PerlIO_printf( Perl_debug_log, "%5s| ","");
1609         } else {
1610             PerlIO_printf( Perl_debug_log, "W%4x| ",
1611                 trie->states[ state ].wordnum
1612             );
1613         }
1614         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1615             SV ** const tmp = av_fetch( revcharmap,
1616                                         TRIE_LIST_ITEM(state,charid).forid, 0);
1617             if ( tmp ) {
1618                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1619                     colwidth,
1620                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1621                               colwidth,
1622                               PL_colors[0], PL_colors[1],
1623                               (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1624                               | PERL_PV_ESCAPE_FIRSTCHAR
1625                     ) ,
1626                     TRIE_LIST_ITEM(state,charid).forid,
1627                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1628                 );
1629                 if (!(charid % 10))
1630                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1631                         (int)((depth * 2) + 14), "");
1632             }
1633         }
1634         PerlIO_printf( Perl_debug_log, "\n");
1635     }
1636 }
1637
1638 /*
1639   Dumps a fully constructed but uncompressed trie in table form.
1640   This is the normal DFA style state transition table, with a few
1641   twists to facilitate compression later.
1642   Used for debugging make_trie().
1643 */
1644 STATIC void
1645 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1646                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1647                           U32 depth)
1648 {
1649     U32 state;
1650     U16 charid;
1651     SV *sv=sv_newmortal();
1652     int colwidth= widecharmap ? 6 : 4;
1653     GET_RE_DEBUG_FLAGS_DECL;
1654
1655     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1656
1657     /*
1658        print out the table precompression so that we can do a visual check
1659        that they are identical.
1660      */
1661
1662     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1663
1664     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1665         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1666         if ( tmp ) {
1667             PerlIO_printf( Perl_debug_log, "%*s",
1668                 colwidth,
1669                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1670                             PL_colors[0], PL_colors[1],
1671                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1672                             PERL_PV_ESCAPE_FIRSTCHAR
1673                 )
1674             );
1675         }
1676     }
1677
1678     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1679
1680     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1681         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1682     }
1683
1684     PerlIO_printf( Perl_debug_log, "\n" );
1685
1686     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1687
1688         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1689             (int)depth * 2 + 2,"",
1690             (UV)TRIE_NODENUM( state ) );
1691
1692         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1693             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1694             if (v)
1695                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1696             else
1697                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1698         }
1699         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1700             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1701                                             (UV)trie->trans[ state ].check );
1702         } else {
1703             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1704                                             (UV)trie->trans[ state ].check,
1705             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1706         }
1707     }
1708 }
1709
1710 #endif
1711
1712
1713 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1714   startbranch: the first branch in the whole branch sequence
1715   first      : start branch of sequence of branch-exact nodes.
1716                May be the same as startbranch
1717   last       : Thing following the last branch.
1718                May be the same as tail.
1719   tail       : item following the branch sequence
1720   word_count : number of words in the sequence
1721   flags      : the OP() type of the nodes being merged; one of EXACT, EXACTF,
                    EXACTFA, EXACTFU or EXACTFU_SS
1722   depth      : indent depth
1723
1724 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1725
1726 A trie is an N'ary tree where the branches are determined by digital
1727 decomposition of the key. I.e., at the root node you look up the 1st character
1728 and follow that branch, repeating until you reach the end of the branches.
1729 Nodes can be marked as "accepting" meaning they represent a complete word. Eg:
1730
1731   /he|she|his|hers/
1732
1733 would convert into the following structure. Numbers represent states, letters
1734 following numbers represent valid transitions on the letter from that state, if
1735 the number is in square brackets it represents an accepting state, otherwise it
1736 will be in parenthesis.
1737
1738       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1739       |    |
1740       |   (2)
1741       |    |
1742      (1)   +-i->(6)-+-s->[7]
1743       |
1744       +-s->(3)-+-h->(4)-+-e->[5]
1745
1746       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1747
1748 This shows that when matching against the string 'hers' we will begin at state 1
1749 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1750 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1751 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1752 single traverse. We store a mapping from each accepting state to the word that
1753 was matched, and then when we have multiple possibilities we try to complete
1754 the rest of the regex in the order in which they occurred in the alternation.
1755
1756 The only prior NFA like behaviour that would be changed by the TRIE support is
1757 the silent ignoring of duplicate alternations which are of the form:
1758
1759  / (DUPE|DUPE) X? (?{ ... }) Y /x
1760
1761 Thus EVAL blocks following a trie may be called a different number of times with
1762 and without the optimisation. With the optimisations dupes will be silently
1763 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1764 the following demonstrates:
1765
1766  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1767
1768 which prints out 'word' three times, but
1769
1770  'words'=~/(word|word|word)(?{ print $1 })S/
1771
1772 which doesn't print it out at all. This is due to other optimisations kicking in.
1773
1774 Example of what happens on a structural level:
1775
1776 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1777
1778    1: CURLYM[1] {1,32767}(18)
1779    5:   BRANCH(8)
1780    6:     EXACT <ac>(16)
1781    8:   BRANCH(11)
1782    9:     EXACT <ad>(16)
1783   11:   BRANCH(14)
1784   12:     EXACT <ab>(16)
1785   16:   SUCCEED(0)
1786   17:   NOTHING(18)
1787   18: END(0)
1788
1789 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1790 and should turn into:
1791
1792    1: CURLYM[1] {1,32767}(18)
1793    5:   TRIE(16)
1794         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1795           <ac>
1796           <ad>
1797           <ab>
1798   16:   SUCCEED(0)
1799   17:   NOTHING(18)
1800   18: END(0)
1801
1802 Cases where tail != last would be like /(?:foo|bar)baz/:
1803
1804    1: BRANCH(4)
1805    2:   EXACT <foo>(8)
1806    4: BRANCH(7)
1807    5:   EXACT <bar>(8)
1808    7: TAIL(8)
1809    8: EXACT <baz>(10)
1810   10: END(0)
1811
1812 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1813 and would end up looking like:
1814
1815     1: TRIE(8)
1816       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1817         <foo>
1818         <bar>
1819    7: TAIL(8)
1820    8: EXACT <baz>(10)
1821   10: END(0)
1822
1823     d = uvchr_to_utf8_flags(d, uv, 0);
1824
1825 is the recommended Unicode-aware way of saying
1826
1827     *(d++) = uv;
1828 */
1829
/* Append to 'revcharmap' a new SV holding the single character 'val'.
 *
 * Under UTF the character is stored UTF-8 encoded and the SV is flagged as
 * UTF-8; otherwise 'val' is truncated to a char and stored as one byte.
 * 'revcharmap' (and whatever UTF expands to) must be in scope at the point
 * of use.  'val' is evaluated exactly once on either path. */
#define TRIE_STORE_REVCHAR(val)                                            \
    STMT_START {                                                           \
        if (UTF) {                                                         \
            /* Size the buffer for the longest encoding uvchr_to_utf8()    \
             * can produce, not just for 7 bytes */                        \
            SV *zlopp = newSV(UTF8_MAXBYTES);                              \
            unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
            unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, (val)); \
            SvCUR_set(zlopp, kapow - flrbbbbb);                            \
            SvPOK_on(zlopp);                                               \
            SvUTF8_on(zlopp);                                              \
            av_push(revcharmap, zlopp);                                    \
        } else {                                                           \
            char ooooff = (char)(val);                                     \
            av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
        }                                                                  \
    } STMT_END
1845
1846 /* This gets the next character from the input, folding it if not already
1847  * folded.
      *
      * The macro takes no arguments; it works on these variables, which must be
      * in scope wherever it is expanded:
      *   uc      - points at the input to read from; it is not advanced here
      *   uvc     - receives the character that was read
      *   len     - receives the length from valid_utf8_to_uvchr() under UTF,
      *             and is set to 1 otherwise
      *   wordlen - incremented by one on every expansion
      *   folder  - compared with PL_fold_latin1 to select the non-UTF handling
      *
      * Under UTF the character is decoded with valid_utf8_to_uvchr().  With the
      * PL_fold_latin1 folder the byte is passed through toLOWER_L1(), and a
      * MICRO_SIGN result is replaced by GREEK_SMALL_LETTER_MU.  In every other
      * case the raw byte is used unchanged. */
1848 #define TRIE_READ_CHAR STMT_START {                                           \
1849     wordlen++;                                                                \
1850     if ( UTF ) {                                                              \
1851         /* if it is UTF then it is either already folded, or does not need    \
1852          * folding */                                                         \
1853         uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
1854     }                                                                         \
1855     else if (folder == PL_fold_latin1) {                                      \
1856         /* This folder implies Unicode rules, which in the range expressible  \
1857          *  by not UTF is the lower case, with the two exceptions, one of     \
1858          *  which should have been taken care of before calling this */       \
1859         assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
1860         uvc = toLOWER_L1(*uc);                                                \
1861         if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
1862         len = 1;                                                              \
1863     } else {                                                                  \
1864         /* raw data, will be folded later if needed */                        \
1865         uvc = (U32)*uc;                                                       \
1866         len = 1;                                                              \
1867     }                                                                         \
1868 } STMT_END
1869
1870
1871
     /* Append the transition (fid => ns) to the transition list of 'state'.
      *
      * If the next free slot (TRIE_LIST_CUR) has reached the allocated size
      * (TRIE_LIST_LEN), the recorded size is doubled and the list is Renew'ed
      * to that size before the new entry is stored; TRIE_LIST_CUR is then
      * incremented.
      *
      * Requires 'trie' to be in scope, and a list to exist already for 'state'
      * (see TRIE_LIST_NEW).  'state' is evaluated several times, so it must not
      * have side effects. */
1872 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1873     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1874         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1875         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1876     }                                                           \
1877     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1878     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1879     TRIE_LIST_CUR( state )++;                                   \
1880 } STMT_END
1881
     /* Create the transition list for 'state': a zero-filled array of 4
      * reg_trie_trans_le entries.  Entry 0 serves as the header, so the next
      * free slot (TRIE_LIST_CUR) starts at 1 and the recorded size
      * (TRIE_LIST_LEN) at 4.  Requires 'trie' to be in scope. */
1882 #define TRIE_LIST_NEW(state) STMT_START {                       \
1883     Newxz( trie->states[ state ].trans.list,               \
1884         4, reg_trie_trans_le );                                 \
1885      TRIE_LIST_CUR( state ) = 1;                                \
1886      TRIE_LIST_LEN( state ) = 4;                                \
1887 } STMT_END
1888
     /* Record that a word ends at trie state 'state'.
      *
      * Besides its argument, the macro reads and writes these variables of the
      * scope it is expanded in: trie, noper, cur, tail, convert, curword,
      * wordlen, word_count, jumper, nextbranch and, inside DEBUG_r, trie_words.
      *
      * In order, it:
      *  - under DEBUG_r, pushes a copy of the word's text (or an empty string
      *    for a NOTHING node) onto 'trie_words';
      *  - increments 'curword' and fills in trie->wordinfo[curword] with
      *    prev = 0, len = wordlen, accept = state;
      *  - if the node following 'noper' lies before 'tail', allocates
      *    trie->jump on first need (word_count + 1 U16 entries) and stores in
      *    it the distance from 'convert' to that node; 'jumper' and
      *    'nextbranch' are set only if they are not set already;
      *  - if 'state' already had a word number, links the new word into that
      *    word's wordinfo[].prev chain; otherwise makes 'curword' the word
      *    number of 'state'. */
1889 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1890     U16 dupe= trie->states[ state ].wordnum;                    \
1891     regnode * const noper_next = regnext( noper );              \
1892                                                                 \
1893     DEBUG_r({                                                   \
1894         /* store the word for dumping */                        \
1895         SV* tmp;                                                \
1896         if (OP(noper) != NOTHING)                               \
1897             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1898         else                                                    \
1899             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1900         av_push( trie_words, tmp );                             \
1901     });                                                         \
1902                                                                 \
1903     curword++;                                                  \
1904     trie->wordinfo[curword].prev   = 0;                         \
1905     trie->wordinfo[curword].len    = wordlen;                   \
1906     trie->wordinfo[curword].accept = state;                     \
1907                                                                 \
1908     if ( noper_next < tail ) {                                  \
1909         if (!trie->jump)                                        \
1910             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
1911                                                  sizeof(U16) ); \
1912         trie->jump[curword] = (U16)(noper_next - convert);      \
1913         if (!jumper)                                            \
1914             jumper = noper_next;                                \
1915         if (!nextbranch)                                        \
1916             nextbranch= regnext(cur);                           \
1917     }                                                           \
1918                                                                 \
1919     if ( dupe ) {                                               \
1920         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1921         /* chain, so that when the bits of chain are later    */\
1922         /* linked together, the dups appear in the chain      */\
1923         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1924         trie->wordinfo[dupe].prev = curword;                    \
1925     } else {                                                    \
1926         /* we haven't inserted this word yet.                */ \
1927         trie->states[ state ].wordnum = curword;                \
1928     }                                                           \
1929 } STMT_END
1930
1931
/* Look up, in the compressed transition table, the state reached from
 * 'state' on the character with id 'charid'.
 *
 * 'base' is the base of 'state' and 'ucharcount' the number of unique
 * characters; the entry consulted is trie->trans[base - ucharcount + charid].
 * That entry is used only if base + charid lies in [ucharcount, ubound), the
 * entry's 'check' equals 'state', and its 'next' is non-zero; the macro then
 * yields that 'next'.  Otherwise it yields 'special' when 'state' is 1 and
 * 0 for any other state.
 *
 * 'trie' and 'ubound' are not parameters: both must be in scope where the
 * macro is expanded.  Every argument is parenthesized in the expansion, so
 * expressions may be passed, but arguments are evaluated more than once and
 * so must not have side effects. */
#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)               \
     ( ( (base) + (charid) >= (ucharcount)                                   \
         && (base) + (charid) < ubound                                       \
         && (state) == trie->trans[ (base) - (ucharcount) + (charid) ].check \
         && trie->trans[ (base) - (ucharcount) + (charid) ].next )           \
           ? trie->trans[ (base) - (ucharcount) + (charid) ].next            \
           : ( (state) == 1 ? (special) : 0 )                                \
      )
1940
     /* Distinct single-bit values (1, 2, 4), so they can be combined and tested
      * with bitwise operators.  NOTE(review): presumably these describe what
      * make_trie() (defined just below, returning I32) produced -- confirm
      * against its return statements and its callers. */
1941 #define MADE_TRIE       1
1942 #define MADE_JUMP_TRIE  2
1943 #define MADE_EXACT_TRIE 4
1944
1945 STATIC I32
1946 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
1947                   regnode *first, regnode *last, regnode *tail,
1948                   U32 word_count, U32 flags, U32 depth)
1949 {
1950     dVAR;
1951     /* first pass, loop through and scan words */
1952     reg_trie_data *trie;
1953     HV *widecharmap = NULL;
1954     AV *revcharmap = newAV();
1955     regnode *cur;
1956     STRLEN len = 0;
1957     UV uvc = 0;
1958     U16 curword = 0;
1959     U32 next_alloc = 0;
1960     regnode *jumper = NULL;
1961     regnode *nextbranch = NULL;
1962     regnode *convert = NULL;
1963     U32 *prev_states; /* temp array mapping each state to previous one */
1964     /* we just use folder as a flag in utf8 */
1965     const U8 * folder = NULL;
1966
1967 #ifdef DEBUGGING
1968     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
1969     AV *trie_words = NULL;
1970     /* along with revcharmap, this only used during construction but both are
1971      * useful during debugging so we store them in the struct when debugging.
1972      */
1973 #else
1974     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
1975     STRLEN trie_charcount=0;
1976 #endif
1977     SV *re_trie_maxbuff;
1978     GET_RE_DEBUG_FLAGS_DECL;
1979
1980     PERL_ARGS_ASSERT_MAKE_TRIE;
1981 #ifndef DEBUGGING
1982     PERL_UNUSED_ARG(depth);
1983 #endif
1984
1985     switch (flags) {
1986         case EXACT: break;
1987         case EXACTFA:
1988         case EXACTFU_SS:
1989         case EXACTFU: folder = PL_fold_latin1; break;
1990         case EXACTF:  folder = PL_fold; break;
1991         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1992     }
1993
1994     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1995     trie->refcount = 1;
1996     trie->startstate = 1;
1997     trie->wordcount = word_count;
1998     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1999     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2000     if (flags == EXACT)
2001         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2002     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2003                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
2004
2005     DEBUG_r({
2006         trie_words = newAV();
2007     });
2008
2009     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2010     if (!SvIOK(re_trie_maxbuff)) {
2011         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2012     }
2013     DEBUG_TRIE_COMPILE_r({
2014         PerlIO_printf( Perl_debug_log,
2015           "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2016           (int)depth * 2 + 2, "",
2017           REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2018           REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2019     });
2020
2021    /* Find the node we are going to overwrite */
2022     if ( first == startbranch && OP( last ) != BRANCH ) {
2023         /* whole branch chain */
2024         convert = first;
2025     } else {
2026         /* branch sub-chain */
2027         convert = NEXTOPER( first );
2028     }
2029
2030     /*  -- First loop and Setup --
2031
2032        We first traverse the branches and scan each word to determine if it
2033        contains widechars, and how many unique chars there are, this is
2034        important as we have to build a table with at least as many columns as we
2035        have unique chars.
2036
2037        We use an array of integers to represent the character codes 0..255
2038        (trie->charmap) and we use a an HV* to store Unicode characters. We use
2039        the native representation of the character value as the key and IV's for
2040        the coded index.
2041
2042        *TODO* If we keep track of how many times each character is used we can
2043        remap the columns so that the table compression later on is more
2044        efficient in terms of memory by ensuring the most common value is in the
2045        middle and the least common are on the outside.  IMO this would be better
2046        than a most to least common mapping as there's a decent chance the most
2047        common letter will share a node with the least common, meaning the node
2048        will not be compressible. With a middle is most common approach the worst
2049        case is when we have the least common nodes twice.
2050
2051      */
2052
2053     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2054         regnode *noper = NEXTOPER( cur );
2055         const U8 *uc = (U8*)STRING( noper );
2056         const U8 *e  = uc + STR_LEN( noper );
2057         int foldlen = 0;
2058         U32 wordlen      = 0;         /* required init */
2059         STRLEN minchars = 0;
2060         STRLEN maxchars = 0;
2061         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2062                                                bitmap?*/
2063
2064         if (OP(noper) == NOTHING) {
2065             regnode *noper_next= regnext(noper);
2066             if (noper_next != tail && OP(noper_next) == flags) {
2067                 noper = noper_next;
2068                 uc= (U8*)STRING(noper);
2069                 e= uc + STR_LEN(noper);
2070                 trie->minlen= STR_LEN(noper);
2071             } else {
2072                 trie->minlen= 0;
2073                 continue;
2074             }
2075         }
2076
2077         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2078             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2079                                           regardless of encoding */
2080             if (OP( noper ) == EXACTFU_SS) {
2081                 /* false positives are ok, so just set this */
2082                 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2083             }
2084         }
2085         for ( ; uc < e ; uc += len ) {  /* Look at each char in the current
2086                                            branch */
2087             TRIE_CHARCOUNT(trie)++;
2088             TRIE_READ_CHAR;
2089
2090             /* TRIE_READ_CHAR returns the current character, or its fold if /i
2091              * is in effect.  Under /i, this character can match itself, or
2092              * anything that folds to it.  If not under /i, it can match just
2093              * itself.  Most folds are 1-1, for example k, K, and KELVIN SIGN
2094              * all fold to k, and all are single characters.   But some folds
2095              * expand to more than one character, so for example LATIN SMALL
2096              * LIGATURE FFI folds to the three character sequence 'ffi'.  If
2097              * the string beginning at 'uc' is 'ffi', it could be matched by
2098              * three characters, or just by the one ligature character. (It
2099              * could also be matched by two characters: LATIN SMALL LIGATURE FF
2100              * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2101              * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2102              * match.)  The trie needs to know the minimum and maximum number
2103              * of characters that could match so that it can use size alone to
2104              * quickly reject many match attempts.  The max is simple: it is
2105              * the number of folded characters in this branch (since a fold is
2106              * never shorter than what folds to it). */
2107
2108             maxchars++;
2109
2110             /* And the min is equal to the max if not under /i (indicated by
2111              * 'folder' being NULL), or there are no multi-character folds.  If
2112              * there is a multi-character fold, the min is incremented just
2113              * once, for the character that folds to the sequence.  Each
2114              * character in the sequence needs to be added to the list below of
2115              * characters in the trie, but we count only the first towards the
2116              * min number of characters needed.  This is done through the
2117              * variable 'foldlen', which is returned by the macros that look
2118              * for these sequences as the number of bytes the sequence
2119              * occupies.  Each time through the loop, we decrement 'foldlen' by
2120              * how many bytes the current char occupies.  Only when it reaches
2121              * 0 do we increment 'minchars' or look for another multi-character
2122              * sequence. */
2123             if (folder == NULL) {
2124                 minchars++;
2125             }
2126             else if (foldlen > 0) {
2127                 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2128             }
2129             else {
2130                 minchars++;
2131
2132                 /* See if *uc is the beginning of a multi-character fold.  If
2133                  * so, we decrement the length remaining to look at, to account
2134                  * for the current character this iteration.  (We can use 'uc'
2135                  * instead of the fold returned by TRIE_READ_CHAR because for
2136                  * non-UTF, the latin1_safe macro is smart enough to account
2137                  * for all the unfolded characters, and because for UTF, the
2138                  * string will already have been folded earlier in the
2139                  * compilation process.) */
2140                 if (UTF) {
2141                     if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2142                         foldlen -= UTF8SKIP(uc);
2143                     }
2144                 }
2145                 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2146                     foldlen--;
2147                 }
2148             }
2149
2150             /* The current character (and any potential folds) should be added
2151              * to the possible matching characters for this position in this
2152              * branch */
2153             if ( uvc < 256 ) {
2154                 if ( folder ) {
2155                     U8 folded= folder[ (U8) uvc ];
2156                     if ( !trie->charmap[ folded ] ) {
2157                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
2158                         TRIE_STORE_REVCHAR( folded );
2159                     }
2160                 }
2161                 if ( !trie->charmap[ uvc ] ) {
2162                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2163                     TRIE_STORE_REVCHAR( uvc );
2164                 }
2165                 if ( set_bit ) {
2166                     /* store the codepoint in the bitmap, and its folded
2167                      * equivalent. */
2168                     TRIE_BITMAP_SET(trie, uvc);
2169
2170                     /* store the folded codepoint */
2171                     if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2172
2173                     if ( !UTF ) {
2174                         /* store first byte of utf8 representation of
2175                            variant codepoints */
2176                         if (! UVCHR_IS_INVARIANT(uvc)) {
2177                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2178                         }
2179                     }
2180                     set_bit = 0; /* We've done our bit :-) */
2181                 }
2182             } else {
2183
2184                 /* XXX We could come up with the list of code points that fold
2185                  * to this using PL_utf8_foldclosures, except not for
2186                  * multi-char folds, as there may be multiple combinations
2187                  * there that could work, which needs to wait until runtime to
2188                  * resolve.  (The comment about LIGATURE FFI above is such an
2189                  * example.) */
2190
2191                 SV** svpp;
2192                 if ( !widecharmap )
2193                     widecharmap = newHV();
2194
2195                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2196
2197                 if ( !svpp )
2198                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2199
2200                 if ( !SvTRUE( *svpp ) ) {
2201                     sv_setiv( *svpp, ++trie->uniquecharcount );
2202                     TRIE_STORE_REVCHAR(uvc);
2203                 }
2204             }
2205         } /* end loop through characters in this branch of the trie */
2206
2207         /* We take the min and max for this branch and combine to find the min
2208          * and max for all branches processed so far */
2209         if( cur == first ) {
2210             trie->minlen = minchars;
2211             trie->maxlen = maxchars;
2212         } else if (minchars < trie->minlen) {
2213             trie->minlen = minchars;
2214         } else if (maxchars > trie->maxlen) {
2215             trie->maxlen = maxchars;
2216         }
2217     } /* end first pass */
2218     DEBUG_TRIE_COMPILE_r(
2219         PerlIO_printf( Perl_debug_log,
2220                 "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2221                 (int)depth * 2 + 2,"",
2222                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2223                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2224                 (int)trie->minlen, (int)trie->maxlen )
2225     );
2226
2227     /*
2228         We now know what we are dealing with in terms of unique chars and
2229         string sizes so we can calculate how much memory a naive
2230         representation using a flat table  will take. If it's over a reasonable
2231         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2232         conservative but potentially much slower representation using an array
2233         of lists.
2234
2235         At the end we convert both representations into the same compressed
2236         form that will be used in regexec.c for matching with. The latter
2237         is a form that cannot be used to construct with but has memory
2238         properties similar to the list form and access properties similar
2239         to the table form making it both suitable for fast searches and
2240         small enough that it's feasible to store for the duration of a program.
2241
2242         See the comment in the code where the compressed table is produced
2243         inplace from the flat table representation for an explanation of how
2244         the compression works.
2245
2246     */
2247
2248
2249     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2250     prev_states[1] = 0;
2251
2252     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2253                                                     > SvIV(re_trie_maxbuff) )
2254     {
2255         /*
2256             Second Pass -- Array Of Lists Representation
2257
2258             Each state will be represented by a list of charid:state records
2259             (reg_trie_trans_le) the first such element holds the CUR and LEN
2260             points of the allocated array. (See defines above).
2261
2262             We build the initial structure using the lists, and then convert
2263             it into the compressed table form which allows faster lookups
2264             (but can't be modified once converted).
2265         */
2266
2267         STRLEN transcount = 1;
2268
2269         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2270             "%*sCompiling trie using list compiler\n",
2271             (int)depth * 2 + 2, ""));
2272
2273         trie->states = (reg_trie_state *)
2274             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2275                                   sizeof(reg_trie_state) );
2276         TRIE_LIST_NEW(1);
2277         next_alloc = 2;
2278
2279         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2280
2281             regnode *noper   = NEXTOPER( cur );
2282             U8 *uc           = (U8*)STRING( noper );
2283             const U8 *e      = uc + STR_LEN( noper );
2284             U32 state        = 1;         /* required init */
2285             U16 charid       = 0;         /* sanity init */
2286             U32 wordlen      = 0;         /* required init */
2287
2288             if (OP(noper) == NOTHING) {
2289                 regnode *noper_next= regnext(noper);
2290                 if (noper_next != tail && OP(noper_next) == flags) {
2291                     noper = noper_next;
2292                     uc= (U8*)STRING(noper);
2293                     e= uc + STR_LEN(noper);
2294                 }
2295             }
2296
2297             if (OP(noper) != NOTHING) {
2298                 for ( ; uc < e ; uc += len ) {
2299
2300                     TRIE_READ_CHAR;
2301
2302                     if ( uvc < 256 ) {
2303                         charid = trie->charmap[ uvc ];
2304                     } else {
2305                         SV** const svpp = hv_fetch( widecharmap,
2306                                                     (char*)&uvc,
2307                                                     sizeof( UV ),
2308                                                     0);
2309                         if ( !svpp ) {
2310                             charid = 0;
2311                         } else {
2312                             charid=(U16)SvIV( *svpp );
2313                         }
2314                     }
2315                     /* charid is now 0 if we don't know the char read, or
2316                      * nonzero if we do */
2317                     if ( charid ) {
2318
2319                         U16 check;
2320                         U32 newstate = 0;
2321
2322                         charid--;
2323                         if ( !trie->states[ state ].trans.list ) {
2324                             TRIE_LIST_NEW( state );
2325                         }
2326                         for ( check = 1;
2327                               check <= TRIE_LIST_USED( state );
2328                               check++ )
2329                         {
2330                             if ( TRIE_LIST_ITEM( state, check ).forid
2331                                                                     == charid )
2332                             {
2333                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2334                                 break;
2335                             }
2336                         }
2337                         if ( ! newstate ) {
2338                             newstate = next_alloc++;
2339                             prev_states[newstate] = state;
2340                             TRIE_LIST_PUSH( state, charid, newstate );
2341                             transcount++;
2342                         }
2343                         state = newstate;
2344                     } else {
2345                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2346                     }
2347                 }
2348             }
2349             TRIE_HANDLE_WORD(state);
2350
2351         } /* end second pass */
2352
2353         /* next alloc is the NEXT state to be allocated */
2354         trie->statecount = next_alloc;
2355         trie->states = (reg_trie_state *)
2356             PerlMemShared_realloc( trie->states,
2357                                    next_alloc
2358                                    * sizeof(reg_trie_state) );
2359
2360         /* and now dump it out before we compress it */
2361         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2362                                                          revcharmap, next_alloc,
2363                                                          depth+1)
2364         );
2365
2366         trie->trans = (reg_trie_trans *)
2367             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2368         {
2369             U32 state;
2370             U32 tp = 0;
2371             U32 zp = 0;
2372
2373
2374             for( state=1 ; state < next_alloc ; state ++ ) {
2375                 U32 base=0;
2376
2377                 /*
2378                 DEBUG_TRIE_COMPILE_MORE_r(
2379                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2380                 );
2381                 */
2382
2383                 if (trie->states[state].trans.list) {
2384                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2385                     U16 maxid=minid;
2386                     U16 idx;
2387
2388                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2389                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2390                         if ( forid < minid ) {
2391                             minid=forid;
2392                         } else if ( forid > maxid ) {
2393                             maxid=forid;
2394                         }
2395                     }
2396                     if ( transcount < tp + maxid - minid + 1) {
2397                         transcount *= 2;
2398                         trie->trans = (reg_trie_trans *)
2399                             PerlMemShared_realloc( trie->trans,
2400                                                      transcount
2401                                                      * sizeof(reg_trie_trans) );
2402                         Zero( trie->trans + (transcount / 2),
2403                               transcount / 2,
2404                               reg_trie_trans );
2405                     }
2406                     base = trie->uniquecharcount + tp - minid;
2407                     if ( maxid == minid ) {
2408                         U32 set = 0;
2409                         for ( ; zp < tp ; zp++ ) {
2410                             if ( ! trie->trans[ zp ].next ) {
2411                                 base = trie->uniquecharcount + zp - minid;
2412                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2413                                                                    1).newstate;
2414                                 trie->trans[ zp ].check = state;
2415                                 set = 1;
2416                                 break;
2417                             }
2418                         }
2419                         if ( !set ) {
2420                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2421                                                                    1).newstate;
2422                             trie->trans[ tp ].check = state;
2423                             tp++;
2424                             zp = tp;
2425                         }
2426                     } else {
2427                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2428                             const U32 tid = base
2429                                            - trie->uniquecharcount
2430                                            + TRIE_LIST_ITEM( state, idx ).forid;
2431                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2432                                                                 idx ).newstate;
2433                             trie->trans[ tid ].check = state;
2434                         }
2435                         tp += ( maxid - minid + 1 );
2436                     }
2437                     Safefree(trie->states[ state ].trans.list);
2438                 }
2439                 /*
2440                 DEBUG_TRIE_COMPILE_MORE_r(
2441                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2442                 );
2443                 */
2444                 trie->states[ state ].trans.base=base;
2445             }
2446             trie->lasttrans = tp + 1;
2447         }
2448     } else {
2449         /*
2450            Second Pass -- Flat Table Representation.
2451
2452            We don't use the 0 slot of either trans[] or states[] so we add 1 to
2453            each.  We know that we will need Charcount+1 trans at most to store
2454            the data (one row per char at worst case) So we preallocate both
2455            structures assuming worst case.
2456
2457            We then construct the trie using only the .next slots of the entry
2458            structs.
2459
2460            We use the .check field of the first entry of the node temporarily
2461            to make compression both faster and easier by keeping track of how
2462            many non zero fields are in the node.
2463
2464            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2465            transition.
2466
2467            There are two terms at use here: state as a TRIE_NODEIDX() which is
2468            a number representing the first entry of the node, and state as a
2469            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2470            and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2471            if there are 2 entries per node, e.g.:
2472
2473              A B       A B
2474           1. 2 4    1. 3 7
2475           2. 0 3    3. 0 5
2476           3. 0 0    5. 0 0
2477           4. 0 0    7. 0 0
2478
2479            The table is internally in the right hand, idx form. However as we
2480            also have to deal with the states array which is indexed by nodenum
2481            we have to use TRIE_NODENUM() to convert.
2482
2483         */
2484         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2485             "%*sCompiling trie using table compiler\n",
2486             (int)depth * 2 + 2, ""));
2487
2488         trie->trans = (reg_trie_trans *)
2489             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2490                                   * trie->uniquecharcount + 1,
2491                                   sizeof(reg_trie_trans) );
2492         trie->states = (reg_trie_state *)
2493             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2494                                   sizeof(reg_trie_state) );
2495         next_alloc = trie->uniquecharcount + 1;
2496
2497
2498         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2499
2500             regnode *noper   = NEXTOPER( cur );
2501             const U8 *uc     = (U8*)STRING( noper );
2502             const U8 *e      = uc + STR_LEN( noper );
2503
2504             U32 state        = 1;         /* required init */
2505
2506             U16 charid       = 0;         /* sanity init */
2507             U32 accept_state = 0;         /* sanity init */
2508
2509             U32 wordlen      = 0;         /* required init */
2510
2511             if (OP(noper) == NOTHING) {
2512                 regnode *noper_next= regnext(noper);
2513                 if (noper_next != tail && OP(noper_next) == flags) {
2514                     noper = noper_next;
2515                     uc= (U8*)STRING(noper);
2516                     e= uc + STR_LEN(noper);
2517                 }
2518             }
2519
2520             if ( OP(noper) != NOTHING ) {
2521                 for ( ; uc < e ; uc += len ) {
2522
2523                     TRIE_READ_CHAR;
2524
2525                     if ( uvc < 256 ) {
2526                         charid = trie->charmap[ uvc ];
2527                     } else {
2528                         SV* const * const svpp = hv_fetch( widecharmap,
2529                                                            (char*)&uvc,
2530                                                            sizeof( UV ),
2531                                                            0);
2532                         charid = svpp ? (U16)SvIV(*svpp) : 0;
2533                     }
2534                     if ( charid ) {
2535                         charid--;
2536                         if ( !trie->trans[ state + charid ].next ) {
2537                             trie->trans[ state + charid ].next = next_alloc;
2538                             trie->trans[ state ].check++;
2539                             prev_states[TRIE_NODENUM(next_alloc)]
2540                                     = TRIE_NODENUM(state);
2541                             next_alloc += trie->uniquecharcount;
2542                         }
2543                         state = trie->trans[ state + charid ].next;
2544                     } else {
2545                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2546                     }
2547                     /* charid is now 0 if we dont know the char read, or
2548                      * nonzero if we do */
2549                 }
2550             }
2551             accept_state = TRIE_NODENUM( state );
2552             TRIE_HANDLE_WORD(accept_state);
2553
2554         } /* end second pass */
2555
2556         /* and now dump it out before we compress it */
2557         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2558                                                           revcharmap,
2559                                                           next_alloc, depth+1));
2560
2561         {
2562         /*
2563            * Inplace compress the table.*
2564
2565            For sparse data sets the table constructed by the trie algorithm will
2566            be mostly 0/FAIL transitions or to put it another way mostly empty.
2567            (Note that leaf nodes will not contain any transitions.)
2568
2569            This algorithm compresses the tables by eliminating most such
2570            transitions, at the cost of a modest bit of extra work during lookup:
2571
2572            - Each states[] entry contains a .base field which indicates the
2573            index in the trans[] array where its transition data is stored.
2574
2575            - If .base is 0 there are no valid transitions from that node.
2576
2577            - If .base is nonzero then charid is added to it to find an entry in
2578            the trans array.
2579
2580            -If trans[states[state].base+charid].check!=state then the
2581            transition is taken to be a 0/Fail transition. Thus if there are fail
2582            transitions at the front of the node then the .base offset will point
2583            somewhere inside the previous nodes data (or maybe even into a node
2584            even earlier), but the .check field determines if the transition is
2585            valid.
2586
2587            XXX - wrong maybe?
2588            The following process inplace converts the table to the compressed
2589            table: We first do not compress the root node 1,and mark all its
2590            .check pointers as 1 and set its .base pointer as 1 as well. This
2591            allows us to do a DFA construction from the compressed table later,
2592            and ensures that any .base pointers we calculate later are greater
2593            than 0.
2594
2595            - We set 'pos' to indicate the first entry of the second node.
2596
2597            - We then iterate over the columns of the node, finding the first and
2598            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2599            and set the .check pointers accordingly, and advance pos
2600            appropriately and repeat for the next node. Note that when we copy
2601            the next pointers we have to convert them from the original
2602            NODEIDX form to NODENUM form as the former is not valid post
2603            compression.
2604
2605            - If a node has no transitions used we mark its base as 0 and do not
2606            advance the pos pointer.
2607
2608            - If a node only has one transition we use a second pointer into the
2609            structure to fill in allocated fail transitions from other states.
2610            This pointer is independent of the main pointer and scans forward
2611            looking for null transitions that are allocated to a state. When it
2612            finds one it writes the single transition into the "hole".  If the
2613            pointer doesn't find one, the single transition is appended as normal.
2614
2615            - Once compressed we can Renew/realloc the structures to release the
2616            excess space.
2617
2618            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2619            specifically Fig 3.47 and the associated pseudocode.
2620
2621            demq
2622         */
2623         const U32 laststate = TRIE_NODENUM( next_alloc );
2624         U32 state, charid;
2625         U32 pos = 0, zp=0;
2626         trie->statecount = laststate;
2627
2628         for ( state = 1 ; state < laststate ; state++ ) {
2629             U8 flag = 0;
2630             const U32 stateidx = TRIE_NODEIDX( state );
2631             const U32 o_used = trie->trans[ stateidx ].check;
2632             U32 used = trie->trans[ stateidx ].check;
2633             trie->trans[ stateidx ].check = 0;
2634
2635             for ( charid = 0;
2636                   used && charid < trie->uniquecharcount;
2637                   charid++ )
2638             {
2639                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2640                     if ( trie->trans[ stateidx + charid ].next ) {
2641                         if (o_used == 1) {
2642                             for ( ; zp < pos ; zp++ ) {
2643                                 if ( ! trie->trans[ zp ].next ) {
2644                                     break;
2645                                 }
2646                             }
2647                             trie->states[ state ].trans.base
2648                                                     = zp
2649                                                       + trie->uniquecharcount
2650                                                       - charid ;
2651                             trie->trans[ zp ].next
2652                                 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2653                                                              + charid ].next );
2654                             trie->trans[ zp ].check = state;
2655                             if ( ++zp > pos ) pos = zp;
2656                             break;
2657                         }
2658                         used--;
2659                     }
2660                     if ( !flag ) {
2661                         flag = 1;
2662                         trie->states[ state ].trans.base
2663                                        = pos + trie->uniquecharcount - charid ;
2664                     }
2665                     trie->trans[ pos ].next
2666                         = SAFE_TRIE_NODENUM(
2667                                        trie->trans[ stateidx + charid ].next );
2668                     trie->trans[ pos ].check = state;
2669                     pos++;
2670                 }
2671             }
2672         }
2673         trie->lasttrans = pos + 1;
2674         trie->states = (reg_trie_state *)
2675             PerlMemShared_realloc( trie->states, laststate
2676                                    * sizeof(reg_trie_state) );
2677         DEBUG_TRIE_COMPILE_MORE_r(
2678             PerlIO_printf( Perl_debug_log,
2679                 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2680                 (int)depth * 2 + 2,"",
2681                 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2682                        + 1 ),
2683                 (IV)next_alloc,
2684                 (IV)pos,
2685                 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2686             );
2687
2688         } /* end table compress */
2689     }
2690     DEBUG_TRIE_COMPILE_MORE_r(
2691             PerlIO_printf(Perl_debug_log,
2692                 "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2693                 (int)depth * 2 + 2, "",
2694                 (UV)trie->statecount,
2695                 (UV)trie->lasttrans)
2696     );
2697     /* resize the trans array to remove unused space */
2698     trie->trans = (reg_trie_trans *)
2699         PerlMemShared_realloc( trie->trans, trie->lasttrans
2700                                * sizeof(reg_trie_trans) );
2701
2702     {   /* Modify the program and insert the new TRIE node */
2703         U8 nodetype =(U8)(flags & 0xFF);
2704         char *str=NULL;
2705
2706 #ifdef DEBUGGING
2707         regnode *optimize = NULL;
2708 #ifdef RE_TRACK_PATTERN_OFFSETS
2709
2710         U32 mjd_offset = 0;
2711         U32 mjd_nodelen = 0;
2712 #endif /* RE_TRACK_PATTERN_OFFSETS */
2713 #endif /* DEBUGGING */
2714         /*
2715            This means we convert either the first branch or the first Exact,
2716            depending on whether the thing following (in 'last') is a branch
2717            or not and whther first is the startbranch (ie is it a sub part of
2718            the alternation or is it the whole thing.)
2719            Assuming its a sub part we convert the EXACT otherwise we convert
2720            the whole branch sequence, including the first.
2721          */
2722         /* Find the node we are going to overwrite */
2723         if ( first != startbranch || OP( last ) == BRANCH ) {
2724             /* branch sub-chain */
2725             NEXT_OFF( first ) = (U16)(last - first);
2726 #ifdef RE_TRACK_PATTERN_OFFSETS
2727             DEBUG_r({
2728                 mjd_offset= Node_Offset((convert));
2729                 mjd_nodelen= Node_Length((convert));
2730             });
2731 #endif
2732             /* whole branch chain */
2733         }
2734 #ifdef RE_TRACK_PATTERN_OFFSETS
2735         else {
2736             DEBUG_r({
2737                 const  regnode *nop = NEXTOPER( convert );
2738                 mjd_offset= Node_Offset((nop));
2739                 mjd_nodelen= Node_Length((nop));
2740             });
2741         }
2742         DEBUG_OPTIMISE_r(
2743             PerlIO_printf(Perl_debug_log,
2744                 "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2745                 (int)depth * 2 + 2, "",
2746                 (UV)mjd_offset, (UV)mjd_nodelen)
2747         );
2748 #endif
2749         /* But first we check to see if there is a common prefix we can
2750            split out as an EXACT and put in front of the TRIE node.  */
2751         trie->startstate= 1;
2752         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2753             U32 state;
2754             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2755                 U32 ofs = 0;
2756                 I32 idx = -1;
2757                 U32 count = 0;
2758                 const U32 base = trie->states[ state ].trans.base;
2759
2760                 if ( trie->states[state].wordnum )
2761                         count = 1;
2762
2763                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2764                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2765                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2766                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2767                     {
2768                         if ( ++count > 1 ) {
2769                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2770                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2771                             if ( state == 1 ) break;
2772                             if ( count == 2 ) {
2773                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2774                                 DEBUG_OPTIMISE_r(
2775                                     PerlIO_printf(Perl_debug_log,
2776                                         "%*sNew Start State=%"UVuf" Class: [",
2777                                         (int)depth * 2 + 2, "",
2778                                         (UV)state));
2779                                 if (idx >= 0) {
2780                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2781                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2782
2783                                     TRIE_BITMAP_SET(trie,*ch);
2784                                     if ( folder )
2785                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2786                                     DEBUG_OPTIMISE_r(
2787                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2788                                     );
2789                                 }
2790                             }
2791                             TRIE_BITMAP_SET(trie,*ch);
2792                             if ( folder )
2793                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2794                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2795                         }
2796                         idx = ofs;
2797                     }
2798                 }
2799                 if ( count == 1 ) {
2800                     SV **tmp = av_fetch( revcharmap, idx, 0);
2801                     STRLEN len;
2802                     char *ch = SvPV( *tmp, len );
2803                     DEBUG_OPTIMISE_r({
2804                         SV *sv=sv_newmortal();
2805                         PerlIO_printf( Perl_debug_log,
2806                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2807                             (int)depth * 2 + 2, "",
2808                             (UV)state, (UV)idx,
2809                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2810                                 PL_colors[0], PL_colors[1],
2811                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2812                                 PERL_PV_ESCAPE_FIRSTCHAR
2813                             )
2814                         );
2815                     });
2816                     if ( state==1 ) {
2817                         OP( convert ) = nodetype;
2818                         str=STRING(convert);
2819                         STR_LEN(convert)=0;
2820                     }
2821                     STR_LEN(convert) += len;
2822                     while (len--)
2823                         *str++ = *ch++;
2824                 } else {
2825 #ifdef DEBUGGING
2826                     if (state>1)
2827                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2828 #endif
2829                     break;
2830                 }
2831             }
2832             trie->prefixlen = (state-1);
2833             if (str) {
2834                 regnode *n = convert+NODE_SZ_STR(convert);
2835                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2836                 trie->startstate = state;
2837                 trie->minlen -= (state - 1);
2838                 trie->maxlen -= (state - 1);
2839 #ifdef DEBUGGING
2840                /* At least the UNICOS C compiler choked on this
2841                 * being argument to DEBUG_r(), so let's just have
2842                 * it right here. */
2843                if (
2844 #ifdef PERL_EXT_RE_BUILD
2845                    1
2846 #else
2847                    DEBUG_r_TEST
2848 #endif
2849                    ) {
2850                    regnode *fix = convert;
2851                    U32 word = trie->wordcount;
2852                    mjd_nodelen++;
2853                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2854                    while( ++fix < n ) {
2855                        Set_Node_Offset_Length(fix, 0, 0);
2856                    }
2857                    while (word--) {
2858                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2859                        if (tmp) {
2860                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2861                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2862                            else
2863                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2864                        }
2865                    }
2866                }
2867 #endif
2868                 if (trie->maxlen) {
2869                     convert = n;
2870                 } else {
2871                     NEXT_OFF(convert) = (U16)(tail - convert);
2872                     DEBUG_r(optimize= n);
2873                 }
2874             }
2875         }
2876         if (!jumper)
2877             jumper = last;
2878         if ( trie->maxlen ) {
2879             NEXT_OFF( convert ) = (U16)(tail - convert);
2880             ARG_SET( convert, data_slot );
2881             /* Store the offset to the first unabsorbed branch in
2882                jump[0], which is otherwise unused by the jump logic.
2883                We use this when dumping a trie and during optimisation. */
2884             if (trie->jump)
2885                 trie->jump[0] = (U16)(nextbranch - convert);
2886
2887             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2888              *   and there is a bitmap
2889              *   and the first "jump target" node we found leaves enough room
2890              * then convert the TRIE node into a TRIEC node, with the bitmap
2891              * embedded inline in the opcode - this is hypothetically faster.
2892              */
2893             if ( !trie->states[trie->startstate].wordnum
2894                  && trie->bitmap
2895                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2896             {
2897                 OP( convert ) = TRIEC;
2898                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2899                 PerlMemShared_free(trie->bitmap);
2900                 trie->bitmap= NULL;
2901             } else
2902                 OP( convert ) = TRIE;
2903
2904             /* store the type in the flags */
2905             convert->flags = nodetype;
2906             DEBUG_r({
2907             optimize = convert
2908                       + NODE_STEP_REGNODE
2909                       + regarglen[ OP( convert ) ];
2910             });
2911             /* XXX We really should free up the resource in trie now,
2912                    as we won't use them - (which resources?) dmq */
2913         }
2914         /* needed for dumping*/
2915         DEBUG_r(if (optimize) {
2916             regnode *opt = convert;
2917
2918             while ( ++opt < optimize) {
2919                 Set_Node_Offset_Length(opt,0,0);
2920             }
2921             /*
2922                 Try to clean up some of the debris left after the
2923                 optimisation.
2924              */
2925             while( optimize < jumper ) {
2926                 mjd_nodelen += Node_Length((optimize));
2927                 OP( optimize ) = OPTIMIZED;
2928                 Set_Node_Offset_Length(optimize,0,0);
2929                 optimize++;
2930             }
2931             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2932         });
2933     } /* end node insert */
2934
2935     /*  Finish populating the prev field of the wordinfo array.  Walk back
2936      *  from each accept state until we find another accept state, and if
2937      *  so, point the first word's .prev field at the second word. If the
2938      *  second already has a .prev field set, stop now. This will be the
2939      *  case either if we've already processed that word's accept state,
2940      *  or that state had multiple words, and the overspill words were
2941      *  already linked up earlier.
2942      */
2943     {
2944         U16 word;
2945         U32 state;
2946         U16 prev;
2947
2948         for (word=1; word <= trie->wordcount; word++) {
2949             prev = 0;
2950             if (trie->wordinfo[word].prev)
2951                 continue;
2952             state = trie->wordinfo[word].accept;
2953             while (state) {
2954                 state = prev_states[state];
2955                 if (!state)
2956                     break;
2957                 prev = trie->states[state].wordnum;
2958                 if (prev)
2959                     break;
2960             }
2961             trie->wordinfo[word].prev = prev;
2962         }
2963         Safefree(prev_states);
2964     }
2965
2966
2967     /* and now dump out the compressed format */
2968     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2969
2970     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2971 #ifdef DEBUGGING
2972     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2973     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2974 #else
2975     SvREFCNT_dec_NN(revcharmap);
2976 #endif
2977     return trie->jump
2978            ? MADE_JUMP_TRIE
2979            : trie->startstate>1
2980              ? MADE_EXACT_TRIE
2981              : MADE_TRIE;
2982 }
2983
2984 STATIC void
2985 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2986 {
2987 /* The Trie is constructed and compressed now so we can build a fail array if
2988  * it's needed
2989
2990    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
2991    3.32 in the
2992    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
2993    Ullman 1985/88
2994    ISBN 0-201-10088-6
2995
2996    We find the fail state for each state in the trie, this state is the longest
2997    proper suffix of the current state's 'word' that is also a proper prefix of
2998    another word in our trie. State 1 represents the word '' and is thus the
2999    default fail state. This allows the DFA not to have to restart after its
3000    tried and failed a word at a given point, it simply continues as though it
3001    had been matching the other word in the first place.
3002    Consider
3003       'abcdgu'=~/abcdefg|cdgu/
3004    When we get to 'd' we are still matching the first word, we would encounter
3005    'g' which would fail, which would bring us to the state representing 'd' in
3006    the second word where we would try 'g' and succeed, proceeding to match
3007    'cdgu'.
3008  */
3009  /* add a fail transition */
3010     const U32 trie_offset = ARG(source);
3011     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3012     U32 *q;
3013     const U32 ucharcount = trie->uniquecharcount;
3014     const U32 numstates = trie->statecount;
3015     const U32 ubound = trie->lasttrans + ucharcount;
3016     U32 q_read = 0;
3017     U32 q_write = 0;
3018     U32 charid;
3019     U32 base = trie->states[ 1 ].trans.base;
3020     U32 *fail;
3021     reg_ac_data *aho;
3022     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3023     GET_RE_DEBUG_FLAGS_DECL;
3024
3025     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
3026 #ifndef DEBUGGING
3027     PERL_UNUSED_ARG(depth);
3028 #endif
3029
3030
3031     ARG_SET( stclass, data_slot );
3032     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3033     RExC_rxi->data->data[ data_slot ] = (void*)aho;
3034     aho->trie=trie_offset;
3035     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3036     Copy( trie->states, aho->states, numstates, reg_trie_state );
3037     Newxz( q, numstates, U32);
3038     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3039     aho->refcount = 1;
3040     fail = aho->fail;
3041     /* initialize fail[0..1] to be 1 so that we always have
3042        a valid final fail state */
3043     fail[ 0 ] = fail[ 1 ] = 1;
3044
3045     for ( charid = 0; charid < ucharcount ; charid++ ) {
3046         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3047         if ( newstate ) {
3048             q[ q_write ] = newstate;
3049             /* set to point at the root */
3050             fail[ q[ q_write++ ] ]=1;
3051         }
3052     }
3053     while ( q_read < q_write) {
3054         const U32 cur = q[ q_read++ % numstates ];
3055         base = trie->states[ cur ].trans.base;
3056
3057         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3058             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3059             if (ch_state) {
3060                 U32 fail_state = cur;
3061                 U32 fail_base;
3062                 do {
3063                     fail_state = fail[ fail_state ];
3064                     fail_base = aho->states[ fail_state ].trans.base;
3065                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3066
3067                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3068                 fail[ ch_state ] = fail_state;
3069                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3070                 {
3071                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
3072                 }
3073                 q[ q_write++ % numstates] = ch_state;
3074             }
3075         }
3076     }
3077     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3078        when we fail in state 1, this allows us to use the
3079        charclass scan to find a valid start char. This is based on the principle
3080        that theres a good chance the string being searched contains lots of stuff
3081        that cant be a start char.
3082      */
3083     fail[ 0 ] = fail[ 1 ] = 0;
3084     DEBUG_TRIE_COMPILE_r({
3085         PerlIO_printf(Perl_debug_log,
3086                       "%*sStclass Failtable (%"UVuf" states): 0",
3087                       (int)(depth * 2), "", (UV)numstates
3088         );
3089         for( q_read=1; q_read<numstates; q_read++ ) {
3090             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3091         }
3092         PerlIO_printf(Perl_debug_log, "\n");
3093     });
3094     Safefree(q);
3095     /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3096 }
3097
3098
/* DEBUG_PEEP(str, scan, depth): under DEBUG_OPTIMISE_r, and only when 'scan'
 * is non-NULL, print one trace line for the regnode 'scan' to Perl_debug_log:
 * an indent of depth*2 columns, the tag 'str', REG_NODE_NUM(scan), the text
 * that regprop() produces for the node, and in parentheses the REG_NODE_NUM
 * of regnext(scan) (0 when regnext() returns NULL).
 *
 * 'str' has to be a string literal, because it is concatenated into the
 * format string.  'depth' is parenthesized before it is cast and doubled, so
 * that an expression argument such as depth+1 indents by (depth+1)*2 and not
 * by depth + 1*2.
 *
 * The expansion declares the block-local names 'mysv' and 'Next', and it
 * already ends in a ';' of its own. */
#define DEBUG_PEEP(str,scan,depth) \
    DEBUG_OPTIMISE_r({if (scan){ \
       SV * const mysv=sv_newmortal(); \
       regnode *Next = regnext(scan); \
       regprop(RExC_rx, mysv, scan, NULL); \
       PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
       (int)(depth)*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
       Next ? (REG_NODE_NUM(Next)) : 0 ); \
   }});
3108
3109
3110 /* The below joins as many adjacent EXACTish nodes as possible into a single
3111  * one.  The regop may be changed if the node(s) contain certain sequences that
3112  * require special handling.  The joining is only done if:
3113  * 1) there is room in the current conglomerated node to entirely contain the
3114  *    next one.
3115  * 2) they are the exact same node type
3116  *
3117  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3118  * these get optimized out
3119  *
3120  * If a node is to match under /i (folded), the number of characters it matches
3121  * can be different than its character length if it contains a multi-character
3122  * fold.  *min_subtract is set to the total delta number of characters of the
3123  * input nodes.
3124  *
3125  * And *unfolded_multi_char is set to indicate whether or not the node contains
3126  * an unfolded multi-char fold.  This happens when whether the fold is valid or
3127  * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3128  * SMALL LETTER SHARP S, as only if the target string being matched against
3129  * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3130  * folding rules depend on the locale in force at runtime.  (Multi-char folds
3131  * whose components are all above the Latin1 range are not run-time locale
3132  * dependent, and have already been folded by the time this function is
3133  * called.)
3134  *
3135  * This is as good a place as any to discuss the design of handling these
3136  * multi-character fold sequences.  It's been wrong in Perl for a very long
3137  * time.  There are three code points in Unicode whose multi-character folds
3138  * were long ago discovered to mess things up.  The previous designs for
3139  * dealing with these involved assigning a special node for them.  This
3140  * approach doesn't always work, as evidenced by this example:
3141  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
3142  * Both sides fold to "sss", but if the pattern is parsed to create a node that
3143  * would match just the \xDF, it won't be able to handle the case where a
3144  * successful match would have to cross the node's boundary.  The new approach
3145  * that hopefully generally solves the problem generates an EXACTFU_SS node
3146  * that is "sss" in this case.
3147  *
3148  * It turns out that there are problems with all multi-character folds, and not
3149  * just these three.  Now the code is general, for all such cases.  The
3150  * approach taken is:
3151  * 1)   This routine examines each EXACTFish node that could contain multi-
3152  *      character folded sequences.  Since a single character can fold into
3153  *      such a sequence, the minimum match length for this node is less than
3154  *      the number of characters in the node.  This routine returns in
3155  *      *min_subtract how many characters to subtract from the actual
3156  *      length of the string to get a real minimum match length; it is 0 if
3157  *      there are no multi-char folds.  This delta is used by the caller to
3158  *      adjust the min length of the match, and the delta between min and max,
3159  *      so that the optimizer doesn't reject these possibilities based on size
3160  *      constraints.
3161  * 2)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3162  *      is used for an EXACTFU node that contains at least one "ss" sequence in
3163  *      it.  For non-UTF-8 patterns and strings, this is the only case where
3164  *      there is a possible fold length change.  That means that a regular
3165  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
3166  *      with length changes, and so can be processed faster.  regexec.c takes
3167  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
3168  *      pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3169  *      known until runtime).  This saves effort in regex matching.  However,
3170  *      the pre-folding isn't done for non-UTF8 patterns because the fold of
3171  *      the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3172  *      forcing the pattern into UTF8 unless necessary.  Also what EXACTF (and,
3173  *      again, EXACTFL) nodes fold to isn't known until runtime.  The fold
3174  *      possibilities for the non-UTF8 patterns are quite simple, except for
3175  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
3176  *      members of a fold-pair, and arrays are set up for all of them so that
3177  *      the other member of the pair can be found quickly.  Code elsewhere in
3178  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3179  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
3180  *      described in the next item.
3181  * 3)   A problem remains for unfolded multi-char folds. (These occur when the
3182  *      validity of the fold won't be known until runtime, and so must remain
3183  *      unfolded for now.  This happens for the sharp s in EXACTF and EXACTFA
3184  *      nodes when the pattern isn't in UTF-8.  (Note, BTW, that there cannot
3185  *      be an EXACTF node with a UTF-8 pattern.)  They also occur for various
3186  *      folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3187  *      The reason this is a problem is that the optimizer part of regexec.c
3188  *      (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3189  *      that a character in the pattern corresponds to at most a single
3190  *      character in the target string.  (And I do mean character, and not byte
3191  *      here, unlike other parts of the documentation that have never been
3192  *      updated to account for multibyte Unicode.)  sharp s in EXACTF and
3193  *      EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3194  *      it can match "\x{17F}\x{17F}".  These, along with other ones in EXACTFL
3195  *      nodes, violate the assumption, and they are the only instances where it
3196  *      is violated.  I'm reluctant to try to change the assumption, as the
3197  *      code involved is impenetrable to me (khw), so instead the code here
3198  *      punts.  This routine examines EXACTFL nodes, and (when the pattern
3199  *      isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3200  *      boolean indicating whether or not the node contains such a fold.  When
3201  *      it is true, the caller sets a flag that later causes the optimizer in
3202  *      this file to not set values for the floating and fixed string lengths,
3203  *      and thus avoids the optimizer code in regexec.c that makes the invalid
3204  *      assumption.  Thus, there is no optimization based on string lengths for
3205  *      EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3206  *      EXACTF and EXACTFA nodes that contain the sharp s.  (The reason the
3207  *      assumption is wrong only in these cases is that all other non-UTF-8
3208  *      folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3209  *      their expanded versions.  (Again, we can't prefold sharp s to 'ss' in
3210  *      EXACTF nodes because we don't know at compile time if it actually
3211  *      matches 'ss' or not.  For EXACTF nodes it will match iff the target
3212  *      string is in UTF-8.  This is in contrast to EXACTFU nodes, where it
3213  *      always matches; and EXACTFA where it never does.  In an EXACTFA node in
3214  *      a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}", avoiding the
3215  *      problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3216  *      string would require the pattern to be forced into UTF-8, the overhead
3217  *      of which we want to avoid.  Similarly the unfolded multi-char folds in
3218  *      EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3219  *      locale.)
3220  *
3221  *      Similarly, the code that generates tries doesn't currently handle
3222  *      not-already-folded multi-char folds, and it looks like a pain to change
3223  *      that.  Therefore, trie generation of EXACTFA nodes with the sharp s
3224  *      doesn't work.  Instead, such an EXACTFA is turned into a new regnode,
3225  *      EXACTFA_NO_TRIE, which the trie code knows not to handle.  Most people
3226  *      using /iaa matching will be doing so almost entirely with ASCII
3227  *      strings, so this should rarely be encountered in practice */
3228
/* JOIN_EXACT(scan, min_subtract, unfolded_multi_char, flags): when 'scan' is
 * a node whose PL_regkind is EXACT, call join_exact() on it, passing NULL for
 * 'val' and depth+1 for the depth; otherwise do nothing.  The call site must
 * have 'pRExC_state' and 'depth' in scope.
 *
 * The body is wrapped in do { } while (0) so that the expansion is a single
 * statement: an 'else' written after JOIN_EXACT(...); at the call site can no
 * longer attach itself to the 'if' hidden inside the macro.  As before, the
 * caller supplies the terminating ';'. */
#define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
    do { \
        if (PL_regkind[OP(scan)] == EXACT) \
            join_exact(pRExC_state,(scan),(min_subtract),(unfolded_multi_char), (flags),NULL,depth+1); \
    } while (0)
3232
3233 STATIC U32
3234 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3235                    UV *min_subtract, bool *unfolded_multi_char,
3236                    U32 flags,regnode *val, U32 depth)
3237 {
3238     /* Merge several consecutive EXACTish nodes into one. */
3239     regnode *n = regnext(scan);
3240     U32 stringok = 1;
3241     regnode *next = scan + NODE_SZ_STR(scan);
3242     U32 merged = 0;
3243     U32 stopnow = 0;
3244 #ifdef DEBUGGING
3245     regnode *stop = scan;
3246     GET_RE_DEBUG_FLAGS_DECL;
3247 #else
3248     PERL_UNUSED_ARG(depth);
3249 #endif
3250
3251     PERL_ARGS_ASSERT_JOIN_EXACT;
3252 #ifndef EXPERIMENTAL_INPLACESCAN
3253     PERL_UNUSED_ARG(flags);
3254     PERL_UNUSED_ARG(val);
3255 #endif
3256     DEBUG_PEEP("join",scan,depth);
3257
3258     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
3259      * EXACT ones that are mergeable to the current one. */
3260     while (n
3261            && (PL_regkind[OP(n)] == NOTHING
3262                || (stringok && OP(n) == OP(scan)))
3263            && NEXT_OFF(n)
3264            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3265     {
3266
3267         if (OP(n) == TAIL || n > next)
3268             stringok = 0;
3269         if (PL_regkind[OP(n)] == NOTHING) {
3270             DEBUG_PEEP("skip:",n,depth);
3271             NEXT_OFF(scan) += NEXT_OFF(n);
3272             next = n + NODE_STEP_REGNODE;
3273 #ifdef DEBUGGING
3274             if (stringok)
3275                 stop = n;
3276 #endif
3277             n = regnext(n);
3278         }
3279         else if (stringok) {
3280             const unsigned int oldl = STR_LEN(scan);
3281             regnode * const nnext = regnext(n);
3282
3283             /* XXX I (khw) kind of doubt that this works on platforms (should
3284              * Perl ever run on one) where U8_MAX is above 255 because of lots
3285              * of other assumptions */
3286             /* Don't join if the sum can't fit into a single node */
3287             if (oldl + STR_LEN(n) > U8_MAX)
3288                 break;
3289
3290             DEBUG_PEEP("merg",n,depth);
3291             merged++;
3292
3293             NEXT_OFF(scan) += NEXT_OFF(n);
3294             STR_LEN(scan) += STR_LEN(n);
3295             next = n + NODE_SZ_STR(n);
3296             /* Now we can overwrite *n : */
3297             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3298 #ifdef DEBUGGING
3299             stop = next - 1;
3300 #endif
3301             n = nnext;
3302             if (stopnow) break;
3303         }
3304
3305 #ifdef EXPERIMENTAL_INPLACESCAN
3306         if (flags && !NEXT_OFF(n)) {
3307             DEBUG_PEEP("atch", val, depth);
3308             if (reg_off_by_arg[OP(n)]) {
3309                 ARG_SET(n, val - n);
3310             }
3311             else {
3312                 NEXT_OFF(n) = val - n;
3313             }
3314             stopnow = 1;
3315         }
3316 #endif
3317     }
3318
3319     *min_subtract = 0;
3320     *unfolded_multi_char = FALSE;
3321
3322     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
3323      * can now analyze for sequences of problematic code points.  (Prior to
3324      * this final joining, sequences could have been split over boundaries, and
3325      * hence missed).  The sequences only happen in folding, hence for any
3326      * non-EXACT EXACTish node */
3327     if (OP(scan) != EXACT) {
3328         U8* s0 = (U8*) STRING(scan);
3329         U8* s = s0;
3330         U8* s_end = s0 + STR_LEN(scan);
3331
3332         int total_count_delta = 0;  /* Total delta number of characters that
3333                                        multi-char folds expand to */
3334
3335         /* One pass is made over the node's string looking for all the
3336          * possibilities.  To avoid some tests in the loop, there are two main
3337          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3338          * non-UTF-8 */
3339         if (UTF) {
3340             U8* folded = NULL;
3341
3342             if (OP(scan) == EXACTFL) {
3343                 U8 *d;
3344
3345                 /* An EXACTFL node would already have been changed to another
3346                  * node type unless there is at least one character in it that
3347                  * is problematic; likely a character whose fold definition
3348                  * won't be known until runtime, and so has yet to be folded.
3349                  * For all but the UTF-8 locale, folds are 1-1 in length, but
3350                  * to handle the UTF-8 case, we need to create a temporary
3351                  * folded copy using UTF-8 locale rules in order to analyze it.
3352                  * This is because our macros that look to see if a sequence is
3353                  * a multi-char fold assume everything is folded (otherwise the
3354                  * tests in those macros would be too complicated and slow).
3355                  * Note that here, the non-problematic folds will have already
3356                  * been done, so we can just copy such characters.  We actually
3357                  * don't completely fold the EXACTFL string.  We skip the
3358                  * unfolded multi-char folds, as that would just create work
3359                  * below to figure out the size they already are */
3360
3361                 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3362                 d = folded;
3363                 while (s < s_end) {
3364                     STRLEN s_len = UTF8SKIP(s);
3365                     if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3366                         Copy(s, d, s_len, U8);
3367                         d += s_len;
3368                     }
3369                     else if (is_FOLDS_TO_MULTI_utf8(s)) {
3370                         *unfolded_multi_char = TRUE;
3371                         Copy(s, d, s_len, U8);
3372                         d += s_len;
3373                     }
3374                     else if (isASCII(*s)) {
3375                         *(d++) = toFOLD(*s);
3376                     }
3377                     else {
3378                         STRLEN len;
3379                         _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3380                         d += len;
3381                     }
3382                     s += s_len;
3383                 }
3384
3385                 /* Point the remainder of the routine to look at our temporary
3386                  * folded copy */
3387                 s = folded;
3388                 s_end = d;
3389             } /* End of creating folded copy of EXACTFL string */
3390
3391             /* Examine the string for a multi-character fold sequence.  UTF-8
3392              * patterns have all characters pre-folded by the time this code is
3393              * executed */
3394             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3395                                      length sequence we are looking for is 2 */
3396             {
3397                 int count = 0;  /* How many characters in a multi-char fold */
3398                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3399                 if (! len) {    /* Not a multi-char fold: get next char */
3400                     s += UTF8SKIP(s);
3401                     continue;
3402                 }
3403
3404                 /* Nodes with 'ss' require special handling, except for
3405                  * EXACTFA-ish for which there is no multi-char fold to this */
3406                 if (len == 2 && *s == 's' && *(s+1) == 's'
3407                     && OP(scan) != EXACTFA
3408                     && OP(scan) != EXACTFA_NO_TRIE)
3409                 {
3410                     count = 2;
3411                     if (OP(scan) != EXACTFL) {
3412                         OP(scan) = EXACTFU_SS;
3413                     }
3414                     s += 2;
3415                 }
3416                 else { /* Here is a generic multi-char fold. */
3417                     U8* multi_end  = s + len;
3418
3419                     /* Count how many characters in it.  In the case of /aa, no
3420                      * folds which contain ASCII code points are allowed, so
3421                      * check for those, and skip if found. */
3422                     if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3423                         count = utf8_length(s, multi_end);
3424                         s = multi_end;
3425                     }
3426                     else {
3427                         while (s < multi_end) {
3428                             if (isASCII(*s)) {
3429                                 s++;
3430                                 goto next_iteration;
3431                             }
3432                             else {
3433                                 s += UTF8SKIP(s);
3434                             }
3435                             count++;
3436                         }
3437                     }
3438                 }
3439
3440                 /* The delta is how long the sequence is minus 1 (1 is how long
3441                  * the character that folds to the sequence is) */
3442                 total_count_delta += count - 1;
3443               next_iteration: ;
3444             }
3445
3446             /* We created a temporary folded copy of the string in EXACTFL
3447              * nodes.  The real string could be shorter, so cap the delta at its
3448              * character count to keep the minimum length from going below zero */
3449             if (OP(scan) == EXACTFL) {
3450                 int total_chars = utf8_length((U8*) STRING(scan),
3451                                            (U8*) STRING(scan) + STR_LEN(scan));
3452                 if (total_count_delta > total_chars) {
3453                     total_count_delta = total_chars;
3454                 }
3455             }
3456
3457             *min_subtract += total_count_delta;
3458             Safefree(folded);
3459         }
3460         else if (OP(scan) == EXACTFA) {
3461
3462             /* Non-UTF-8 pattern, EXACTFA node.  There can't be a multi-char
3463              * fold to the ASCII range (and there are no existing ones in the
3464              * upper latin1 range).  But, as outlined in the comments preceding
3465              * this function, we need to flag any occurrences of the sharp s.
3466              * This character forbids trie formation (because of added
3467              * complexity) */
3468             while (s < s_end) {
3469                 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3470                     OP(scan) = EXACTFA_NO_TRIE;
3471                     *unfolded_multi_char = TRUE;
3472                     break;
3473                 }
3474                 s++;
3475                 continue;
3476             }
3477         }
3478         else {
3479
3480             /* Non-UTF-8 pattern, not EXACTFA node.  Look for the multi-char
3481              * folds that are all Latin1.  As explained in the comments
3482              * preceding this function, we look also for the sharp s in EXACTF
3483              * and EXACTFL nodes; it can be in the final position.  Otherwise
3484              * we can stop looking 1 byte earlier because we have to find at least
3485              * two characters for a multi-fold */
3486             const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3487                               ? s_end
3488                               : s_end -1;
3489
3490             while (s < upper) {
3491                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3492                 if (! len) {    /* Not a multi-char fold. */
3493                     if (*s == LATIN_SMALL_LETTER_SHARP_S
3494                         && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
3495                     {
3496                         *unfolded_multi_char = TRUE;
3497                     }
3498                     s++;
3499                     continue;
3500                 }
3501
3502                 if (len == 2
3503                     && isARG2_lower_or_UPPER_ARG1('s', *s)
3504                     && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
3505                 {
3506
3507                     /* EXACTF nodes need to know that the minimum length
3508                      * changed so that a sharp s in the string can match this
3509                      * ss in the pattern, but they remain EXACTF nodes, as they
3510                      * won't match this unless the target string is UTF-8,
3511                      * which we don't know until runtime.  EXACTFL nodes can't
3512                      * transform into EXACTFU nodes */
3513                     if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
3514                         OP(scan) = EXACTFU_SS;
3515                     }
3516                 }
3517
3518                 *min_subtract += len - 1;
3519                 s += len;
3520             }
3521         }
3522     }
3523
3524 #ifdef DEBUGGING
3525     /* Allow dumping but overwriting the collection of skipped
3526      * ops and/or strings with fake optimized ops */
3527     n = scan + NODE_SZ_STR(scan);
3528     while (n <= stop) {
3529         OP(n) = OPTIMIZED;
3530         FLAGS(n) = 0;
3531         NEXT_OFF(n) = 0;
3532         n++;
3533     }
3534 #endif
3535     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3536     return stopnow;
3537 }
3538
3539 /* REx optimizer.  Converts nodes into quicker variants "in place".
3540    Finds fixed substrings.  */
3541
3542 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3543    to the position after last scanned or to NULL. */
3544
3545 #define INIT_AND_WITHP /* allocate one regnode_ssc into the caller's 'and_withp' */ \
3546     assert(!and_withp); /* and_withp must be NULL on entry */ \
3547     Newx(and_withp,1, regnode_ssc); /* NB: bare statements, no do/while wrapper */ \
3548     SAVEFREEPV(and_withp) /* NOTE(review): presumably freed at scope exit -- confirm */
3549
3550 /* This is a chain of data about sub-patterns we are processing that
3551    need to be handled separately/specially in study_chunk.  It's so
3552    we can simulate recursion without losing state.  */
3553 struct scan_frame; /* forward declaration of the struct tag */
3554 typedef struct scan_frame {
3555     regnode *last;  /* last node to process in this frame */
3556     regnode *next;  /* next node to process when last is reached */
3557     struct scan_frame *prev; /* previous frame in the chain */
3558     U32 prev_recursed_depth; /* NOTE(review): presumably the recursed_depth in effect before this frame -- confirm */
3559     I32 stop; /* which stopparen to use for this frame */
3560 } scan_frame;
3561
3562
3563 STATIC SSize_t
3564 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3565                         SSize_t *minlenp, SSize_t *deltap,
3566                         regnode *last,
3567                         scan_data_t *data,
3568                         I32 stopparen,
3569                         U32 recursed_depth,
3570                         regnode_ssc *and_withp,
3571                         U32 flags, U32 depth)
3572                         /* scanp: Start here (read-write). */
3573                         /* deltap: Write maxlen-minlen here. */
3574                         /* last: Stop before this one. */
3575                         /* data: string data about the pattern */
3576                         /* stopparen: treat close N as END */
3577                         /* recursed: which subroutines have we recursed into */
3578                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3579 {
3580     dVAR;
3581     /* There must be at least this number of characters to match */
3582     SSize_t min = 0;
3583     I32 pars = 0, code;
3584     regnode *scan = *scanp, *next;
3585     SSize_t delta = 0;
3586     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3587     int is_inf_internal = 0;            /* The studied chunk is infinite */
3588     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3589     scan_data_t data_fake;
3590     SV *re_trie_maxbuff = NULL;
3591     regnode *first_non_open = scan;
3592     SSize_t stopmin = SSize_t_MAX;
3593     scan_frame *frame = NULL;
3594     GET_RE_DEBUG_FLAGS_DECL;
3595
3596     PERL_ARGS_ASSERT_STUDY_CHUNK;
3597
3598 #ifdef DEBUGGING
3599     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3600 #endif
3601     if ( depth == 0 ) {
3602         while (first_non_open && OP(first_non_open) == OPEN)
3603             first_non_open=regnext(first_non_open);
3604     }
3605
3606
3607   fake_study_recurse:
3608     while ( scan && OP(scan) != END && scan < last ){
3609         UV min_subtract = 0;    /* How many chars to subtract from the minimum
3610                                    node length to get a real minimum (because
3611                                    the folded version may be shorter) */
3612         bool unfolded_multi_char = FALSE;
3613         /* Peephole optimizer: */
3614         DEBUG_OPTIMISE_MORE_r(
3615         {
3616             PerlIO_printf(Perl_debug_log,
3617                 "%*sstudy_chunk stopparen=%ld depth=%lu recursed_depth=%lu ",
3618                 ((int) depth*2), "", (long)stopparen,
3619                 (unsigned long)depth, (unsigned long)recursed_depth);
3620             if (recursed_depth) {
3621                 U32 i;
3622                 U32 j;
3623                 for ( j = 0 ; j < recursed_depth ; j++ ) {
3624                     PerlIO_printf(Perl_debug_log,"[");
3625                     for ( i = 0 ; i < (U32)RExC_npar ; i++ )
3626                         PerlIO_printf(Perl_debug_log,"%d",
3627                             PAREN_TEST(RExC_study_chunk_recursed +
3628                                        (j * RExC_study_chunk_recursed_bytes), i)
3629                             ? 1 : 0
3630                         );
3631                     PerlIO_printf(Perl_debug_log,"]");
3632                 }
3633             }
3634             PerlIO_printf(Perl_debug_log,"\n");
3635         }
3636         );
3637         DEBUG_STUDYDATA("Peep:", data, depth);
3638         DEBUG_PEEP("Peep", scan, depth);
3639
3640
3641         /* The reason we do this here is that we need to deal with things like /(?:f)(?:o)(?:o)/
3642          * which can't be dealt with by the normal EXACT parsing code, as each (?:..) is handled
3643          * by a different invocation of reg() -- Yves
3644          */
3645         JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
3646
3647         /* Follow the next-chain of the current node and optimize
3648            away all the NOTHINGs from it.  */
3649         if (OP(scan) != CURLYX) {
3650             const int max = (reg_off_by_arg[OP(scan)]
3651                        ? I32_MAX
3652                        /* I32 may be smaller than U16 on CRAYs! */
3653                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3654             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3655             int noff;
3656             regnode *n = scan;
3657
3658             /* Skip NOTHING and LONGJMP. */
3659             while ((n = regnext(n))
3660                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3661                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3662                    && off + noff < max)
3663                 off += noff;
3664             if (reg_off_by_arg[OP(scan)])
3665                 ARG(scan) = off;
3666             else
3667                 NEXT_OFF(scan) = off;
3668         }
3669
3670
3671
3672         /* The principal pseudo-switch.  Cannot be a switch, since we
3673            look into several different things.  */
3674         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3675                    || OP(scan) == IFTHEN) {
3676             next = regnext(scan);
3677             code = OP(scan);
3678             /* demq: the op(next)==code check is to see if we have
3679              * "branch-branch" AFAICT */
3680
3681             if (OP(next) == code || code == IFTHEN) {
3682                 /* NOTE - There is similar code to this block below for
3683                  * handling TRIE nodes on a re-study.  If you change stuff here
3684                  * check there too. */
3685                 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3686                 regnode_ssc accum;
3687                 regnode * const startbranch=scan;
3688
3689                 if (flags & SCF_DO_SUBSTR) {
3690                     /* Cannot merge strings after this. */
3691                     scan_commit(pRExC_state, data, minlenp, is_inf);
3692                 }
3693
3694                 if (flags & SCF_DO_STCLASS)
3695                     ssc_init_zero(pRExC_state, &accum);
3696
3697                 while (OP(scan) == code) {
3698                     SSize_t deltanext, minnext, fake;
3699                     I32 f = 0;
3700                     regnode_ssc this_class;
3701
3702                     num++;
3703                     data_fake.flags = 0;
3704                     if (data) {
3705                         data_fake.whilem_c = data->whilem_c;
3706                         data_fake.last_closep = data->last_closep;
3707                     }
3708                     else
3709                         data_fake.last_closep = &fake;
3710
3711                     data_fake.pos_delta = delta;
3712                     next = regnext(scan);
3713                     scan = NEXTOPER(scan);
3714                     if (code != BRANCH)
3715                         scan = NEXTOPER(scan);
3716                     if (flags & SCF_DO_STCLASS) {
3717                         ssc_init(pRExC_state, &this_class);
3718                         data_fake.start_class = &this_class;
3719                         f = SCF_DO_STCLASS_AND;
3720                     }
3721                     if (flags & SCF_WHILEM_VISITED_POS)
3722                         f |= SCF_WHILEM_VISITED_POS;
3723
3724                     /* we suppose the run is continuous, last=next...*/
3725                     minnext = study_chunk(pRExC_state, &scan, minlenp,
3726                                       &deltanext, next, &data_fake, stopparen,
3727                                       recursed_depth, NULL, f,depth+1);
3728                     if (min1 > minnext)
3729                         min1 = minnext;
3730                     if (deltanext == SSize_t_MAX) {
3731                         is_inf = is_inf_internal = 1;
3732                         max1 = SSize_t_MAX;
3733                     } else if (max1 < minnext + deltanext)
3734                         max1 = minnext + deltanext;
3735                     scan = next;
3736                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3737                         pars++;
3738                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3739                         if ( stopmin > minnext)
3740                             stopmin = min + min1;
3741                         flags &= ~SCF_DO_SUBSTR;
3742                         if (data)
3743                             data->flags |= SCF_SEEN_ACCEPT;
3744                     }
3745                     if (data) {
3746                         if (data_fake.flags & SF_HAS_EVAL)
3747                             data->flags |= SF_HAS_EVAL;
3748                         data->whilem_c = data_fake.whilem_c;
3749                     }
3750                     if (flags & SCF_DO_STCLASS)
3751                         ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
3752                 }
3753                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3754                     min1 = 0;
3755                 if (flags & SCF_DO_SUBSTR) {
3756                     data->pos_min += min1;
3757                     if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3758                         data->pos_delta = SSize_t_MAX;
3759                     else
3760                         data->pos_delta += max1 - min1;
3761                     if (max1 != min1 || is_inf)
3762                         data->longest = &(data->longest_float);
3763                 }
3764                 min += min1;
3765                 if (delta == SSize_t_MAX
3766                  || SSize_t_MAX - delta - (max1 - min1) < 0)
3767                     delta = SSize_t_MAX;
3768                 else
3769                     delta += max1 - min1;
3770                 if (flags & SCF_DO_STCLASS_OR) {
3771                     ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
3772                     if (min1) {
3773                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3774                         flags &= ~SCF_DO_STCLASS;
3775                     }
3776                 }
3777                 else if (flags & SCF_DO_STCLASS_AND) {
3778                     if (min1) {
3779                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3780                         flags &= ~SCF_DO_STCLASS;
3781                     }
3782                     else {
3783                         /* Switch to OR mode: cache the old value of
3784                          * data->start_class */
3785                         INIT_AND_WITHP;
3786                         StructCopy(data->start_class, and_withp, regnode_ssc);
3787                         flags &= ~SCF_DO_STCLASS_AND;
3788                         StructCopy(&accum, data->start_class, regnode_ssc);
3789                         flags |= SCF_DO_STCLASS_OR;
3790                     }
3791                 }
3792
3793                 if (PERL_ENABLE_TRIE_OPTIMISATION &&
3794                         OP( startbranch ) == BRANCH )
3795                 {
3796                 /* demq.
3797
3798                    Assuming this was/is a branch we are dealing with: 'scan'
3799                    now points at the item that follows the branch sequence,
3800                    whatever it is. We now start at the beginning of the
3801                    sequence and look for subsequences of
3802
3803                    BRANCH->EXACT=>x1
3804                    BRANCH->EXACT=>x2
3805                    tail
3806
3807                    which would be constructed from a pattern like
3808                    /A|LIST|OF|WORDS/
3809
3810                    If we can find such a subsequence we need to turn the first
3811                    element into a trie and then add the subsequent branch exact
3812                    strings to the trie.
3813
3814                    We have two cases
3815
3816                      1. patterns where the whole set of branches can be
3817                         converted.
3818
3819                      2. patterns where only a subset can be converted.
3820
3821                    In case 1 we can replace the whole set with a single regop
3822                    for the trie. In case 2 we need to keep the start and end
3823                    branches so
3824
3825                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3826                      becomes BRANCH TRIE; BRANCH X;
3827
3828                   There is an additional case, that being where there is a
3829                   common prefix, which gets split out into an EXACT like node
3830                   preceding the TRIE node.
3831
3832                   If x(1..n)==tail then we can do a simple trie, if not we make
3833                   a "jump" trie, such that when we match the appropriate word
3834                   we "jump" to the appropriate tail node. Essentially we turn
3835                   a nested if into a case structure of sorts.
3836
3837                 */
3838
3839                     int made=0;
3840                     if (!re_trie_maxbuff) {
3841                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3842                         if (!SvIOK(re_trie_maxbuff))
3843                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3844                     }
3845                     if ( SvIV(re_trie_maxbuff)>=0  ) {
3846                         regnode *cur;
3847                         regnode *first = (regnode *)NULL;
3848                         regnode *last = (regnode *)NULL;
3849                         regnode *tail = scan;
3850                         U8 trietype = 0;
3851                         U32 count=0;
3852
3853 #ifdef DEBUGGING
3854                         SV * const mysv = sv_newmortal();   /* for dumping */
3855 #endif
3856                         /* var tail is used because there may be a TAIL
3857                            regop in the way. Ie, the exacts will point to the
3858                            thing following the TAIL, but the last branch will
3859                            point at the TAIL. So we advance tail. If we
3860                            have nested (?:) we may have to move through several
3861                            tails.
3862                          */
3863
3864                         while ( OP( tail ) == TAIL ) {
3865                             /* this is the TAIL generated by (?:) */
3866                             tail = regnext( tail );
3867                         }
3868
3869
3870                         DEBUG_TRIE_COMPILE_r({
3871                             regprop(RExC_rx, mysv, tail, NULL);
3872                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3873                               (int)depth * 2 + 2, "",
3874                               "Looking for TRIE'able sequences. Tail node is: ",
3875                               SvPV_nolen_const( mysv )
3876                             );
3877                         });
3878
3879                         /*
3880
3881                             Step through the branches
3882                                 cur represents each branch,
3883                                 noper is the first thing to be matched as part
3884                                       of that branch
3885                                 noper_next is the regnext() of that node.
3886
3887                             We normally handle a case like this
3888                             /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
3889                             support building with NOJUMPTRIE, which restricts
3890                             the trie logic to structures like /FOO|BAR/.
3891
3892                             If noper is a trieable nodetype then the branch is
3893                             a possible optimization target. If we are building
3894                             under NOJUMPTRIE then we require that noper_next is
3895                             the same as scan (our current position in the regex
3896                             program).
3897
3898                             Once we have two or more consecutive such branches
3899                             we can create a trie of the EXACT's contents and
3900                             stitch it in place into the program.
3901
3902                             If the sequence represents all of the branches in
3903                             the alternation we replace the entire thing with a
3904                             single TRIE node.
3905
3906                             Otherwise when it is a subsequence we need to
3907                             stitch it in place and replace only the relevant
3908                             branches. This means the first branch has to remain
3909                             as it is used by the alternation logic, and its
3910                             next pointer, and needs to be repointed at the item
3911                             on the branch chain following the last branch we
3912                             have optimized away.
3913
3914                             This could be either a BRANCH, in which case the
3915                             subsequence is internal, or it could be the item
3916                             following the branch sequence in which case the
3917                             subsequence is at the end (which does not
3918                             necessarily mean the first node is the start of the
3919                             alternation).
3920
3921                             TRIE_TYPE(X) is a define which maps the optype to a
3922                             trietype.
3923
3924                                 optype          |  trietype
3925                                 ----------------+-----------
3926                                 NOTHING         | NOTHING
3927                                 EXACT           | EXACT
3928                                 EXACTFU         | EXACTFU
3929                                 EXACTFU_SS      | EXACTFU
3930                                 EXACTFA         | EXACTFA
3931
3932
3933                         */
3934 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3935                        ( EXACT == (X) )   ? EXACT :        \
3936                        ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU :        \
3937                        ( EXACTFA == (X) ) ? EXACTFA :        \
3938                        0 )
3939
3940                         /* dont use tail as the end marker for this traverse */
3941                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3942                             regnode * const noper = NEXTOPER( cur );
3943                             U8 noper_type = OP( noper );
3944                             U8 noper_trietype = TRIE_TYPE( noper_type );
3945 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3946                             regnode * const noper_next = regnext( noper );
3947                             U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3948                             U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3949 #endif
3950
3951                             DEBUG_TRIE_COMPILE_r({
3952                                 regprop(RExC_rx, mysv, cur, NULL);
3953                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3954                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3955
3956                                 regprop(RExC_rx, mysv, noper, NULL);
3957                                 PerlIO_printf( Perl_debug_log, " -> %s",
3958                                     SvPV_nolen_const(mysv));
3959
3960                                 if ( noper_next ) {
3961                                   regprop(RExC_rx, mysv, noper_next, NULL);
3962                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3963                                     SvPV_nolen_const(mysv));
3964                                 }
3965                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3966                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3967                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3968                                 );
3969                             });
3970
3971                             /* Is noper a trieable nodetype that can be merged
3972                              * with the current trie (if there is one)? */
3973                             if ( noper_trietype
3974                                   &&
3975                                   (
3976                                         ( noper_trietype == NOTHING)
3977                                         || ( trietype == NOTHING )
3978                                         || ( trietype == noper_trietype )
3979                                   )
3980 #ifdef NOJUMPTRIE
3981                                   && noper_next == tail
3982 #endif
3983                                   && count < U16_MAX)
3984                             {
3985                                 /* Handle mergeable trieable node.  Either we are
3986                                  * the first node in a new trieable sequence,
3987                                  * in which case we do some bookkeeping, or
3988                                  * we update the end pointer. */
3989                                 if ( !first ) {
3990                                     first = cur;
3991                                     if ( noper_trietype == NOTHING ) {
3992 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3993                                         regnode * const noper_next = regnext( noper );
3994                                         U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3995                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3996 #endif
3997
3998                                         if ( noper_next_trietype ) {
3999                                             trietype = noper_next_trietype;
4000                                         } else if (noper_next_type)  {
4001                                             /* a NOTHING regop is 1 regop wide.
4002                                              * We need at least two for a trie
4003                                              * so we can't merge this in */
4004                                             first = NULL;
4005                                         }
4006                                     } else {
4007                                         trietype = noper_trietype;
4008                                     }
4009                                 } else {
4010                                     if ( trietype == NOTHING )
4011                                         trietype = noper_trietype;
4012                                     last = cur;
4013                                 }
4014                                 if (first)
4015                                     count++;
4016                             } /* end handle mergable triable node */
4017                             else {
4018                                 /* handle unmergeable node -
4019                                  * noper may either be a trieable node which
4020                                  * cannot be merged into the current trie,
4021                                  * or a non-trieable node */
4022                                 if ( last ) {
4023                                     /* If last is set and trietype is not
4024                                      * NOTHING then we have found at least two
4025                                      * triable branch sequences in a row of a
4026                                      * similar trietype so we can turn them
4027                                      * into a trie. If/when we allow NOTHING to
4028                                      * start a trie sequence this condition
4029                                      * will be required, and it isn't expensive
4030                                      * so we leave it in for now. */
4031                                     if ( trietype && trietype != NOTHING )
4032                                         make_trie( pRExC_state,
4033                                                 startbranch, first, cur, tail,
4034                                                 count, trietype, depth+1 );
4035                                     last = NULL; /* note: we clear/update
4036                                                     first, trietype etc below,
4037                                                     so we dont do it here */
4038                                 }
4039                                 if ( noper_trietype
4040 #ifdef NOJUMPTRIE
4041                                      && noper_next == tail
4042 #endif
4043                                 ){
4044                                     /* noper is triable, so we can start a new
4045                                      * trie sequence */
4046                                     count = 1;
4047                                     first = cur;
4048                                     trietype = noper_trietype;
4049                                 } else if (first) {
4050                                     /* if we already saw a first but the
4051                                      * current node is not triable then we have
4052                                      * to reset the first information. */
4053                                     count = 0;
4054                                     first = NULL;
4055                                     trietype = 0;
4056                                 }
4057                             } /* end handle unmergable node */
4058                         } /* loop over branches */
4059                         DEBUG_TRIE_COMPILE_r({
4060                             regprop(RExC_rx, mysv, cur, NULL);
4061                             PerlIO_printf( Perl_debug_log,
4062                               "%*s- %s (%d) <SCAN FINISHED>\n",
4063                               (int)depth * 2 + 2,
4064                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4065
4066                         });
4067                         if ( last && trietype ) {
4068                             if ( trietype != NOTHING ) {
4069                                 /* the last branch of the sequence was part of
4070                                  * a trie, so we have to construct it here
4071                                  * outside of the loop */
4072                                 made= make_trie( pRExC_state, startbranch,
4073                                                  first, scan, tail, count,
4074                                                  trietype, depth+1 );
4075 #ifdef TRIE_STUDY_OPT
4076                                 if ( ((made == MADE_EXACT_TRIE &&
4077                                      startbranch == first)
4078                                      || ( first_non_open == first )) &&
4079                                      depth==0 ) {
4080                                     flags |= SCF_TRIE_RESTUDY;
4081                                     if ( startbranch == first
4082                                          && scan == tail )
4083                                     {
4084                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4085                                     }
4086                                 }
4087 #endif
4088                             } else {
4089                                 /* at this point we know whatever we have is a
4090                                  * NOTHING sequence/branch AND if 'startbranch'
4091                                  * is 'first' then we can turn the whole thing
4092                                  * into a NOTHING
4093                                  */
4094                                 if ( startbranch == first ) {
4095                                     regnode *opt;
4096                                     /* the entire thing is a NOTHING sequence,
4097                                      * something like this: (?:|) So we can
4098                                      * turn it into a plain NOTHING op. */
4099                                     DEBUG_TRIE_COMPILE_r({
4100                                         regprop(RExC_rx, mysv, cur, NULL);
4101                                         PerlIO_printf( Perl_debug_log,
4102                                           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
4103                                           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4104
4105                                     });
4106                                     OP(startbranch)= NOTHING;
4107                                     NEXT_OFF(startbranch)= tail - startbranch;
4108                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
4109                                         OP(opt)= OPTIMIZED;
4110                                 }
4111                             }
4112                         } /* end if ( last) */
4113                     } /* TRIE_MAXBUF is non zero */
4114
4115                 } /* do trie */
4116
4117             }
4118             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
4119                 scan = NEXTOPER(NEXTOPER(scan));
4120             } else                      /* single branch is optimized. */
4121                 scan = NEXTOPER(scan);
4122             continue;
4123         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
4124             scan_frame *newframe = NULL;
4125             I32 paren;
4126             regnode *start;
4127             regnode *end;
4128             U32 my_recursed_depth= recursed_depth;
4129
4130             if (OP(scan) != SUSPEND) {
4131                 /* set the pointer */
4132                 if (OP(scan) == GOSUB) {
4133                     paren = ARG(scan);
4134                     RExC_recurse[ARG2L(scan)] = scan;
4135                     start = RExC_open_parens[paren-1];
4136                     end   = RExC_close_parens[paren-1];
4137                 } else {
4138                     paren = 0;
4139                     start = RExC_rxi->program + 1;
4140                     end   = RExC_opend;
4141                 }
4142                 if (!recursed_depth
4143                     ||
4144                     !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4145                 ) {
4146                     if (!recursed_depth) {
4147                         Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4148                     } else {
4149                         Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4150                              RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4151                              RExC_study_chunk_recursed_bytes, U8);
4152                     }
4153                     /* we haven't recursed into this paren yet, so recurse into it */
4154                     DEBUG_STUDYDATA("set:", data,depth);
4155                     PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4156                     my_recursed_depth= recursed_depth + 1;
4157                     Newx(newframe,1,scan_frame);
4158                 } else {
4159                     DEBUG_STUDYDATA("inf:", data,depth);
4160                     /* some form of infinite recursion, assume infinite length
4161                      * */
4162                     if (flags & SCF_DO_SUBSTR) {
4163                         scan_commit(pRExC_state, data, minlenp, is_inf);
4164                         data->longest = &(data->longest_float);
4165                     }
4166                     is_inf = is_inf_internal = 1;
4167                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4168                         ssc_anything(data->start_class);
4169                     flags &= ~SCF_DO_STCLASS;
4170                 }
4171             } else {
4172                 Newx(newframe,1,scan_frame);
4173                 paren = stopparen;
4174                 start = scan+2;
4175                 end = regnext(scan);
4176             }
4177             if (newframe) {
4178                 assert(start);
4179                 assert(end);
4180                 SAVEFREEPV(newframe);
4181                 newframe->next = regnext(scan);
4182                 newframe->last = last;
4183                 newframe->stop = stopparen;
4184                 newframe->prev = frame;
4185                 newframe->prev_recursed_depth = recursed_depth;
4186
4187                 DEBUG_STUDYDATA("frame-new:",data,depth);
4188                 DEBUG_PEEP("fnew", scan, depth);
4189
4190                 frame = newframe;
4191                 scan =  start;
4192                 stopparen = paren;
4193                 last = end;
4194                 depth = depth + 1;
4195                 recursed_depth= my_recursed_depth;
4196
4197                 continue;
4198             }
4199         }
4200         else if (OP(scan) == EXACT) {
4201             SSize_t l = STR_LEN(scan);
4202             UV uc;
4203             if (UTF) {
4204                 const U8 * const s = (U8*)STRING(scan);
4205                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4206                 l = utf8_length(s, s + l);
4207             } else {
4208                 uc = *((U8*)STRING(scan));
4209             }
4210             min += l;
4211             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4212                 /* The code below prefers earlier match for fixed
4213                    offset, later match for variable offset.  */
4214                 if (data->last_end == -1) { /* Update the start info. */
4215                     data->last_start_min = data->pos_min;
4216                     data->last_start_max = is_inf
4217                         ? SSize_t_MAX : data->pos_min + data->pos_delta;
4218                 }
4219                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4220                 if (UTF)
4221                     SvUTF8_on(data->last_found);
4222                 {
4223                     SV * const sv = data->last_found;
4224                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4225                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4226                     if (mg && mg->mg_len >= 0)
4227                         mg->mg_len += utf8_length((U8*)STRING(scan),
4228                                               (U8*)STRING(scan)+STR_LEN(scan));
4229                 }
4230                 data->last_end = data->pos_min + l;
4231                 data->pos_min += l; /* As in the first entry. */
4232                 data->flags &= ~SF_BEFORE_EOL;
4233             }
4234
4235             /* ANDing the code point leaves at most it, and not in locale, and
4236              * can't match null string */
4237             if (flags & SCF_DO_STCLASS_AND) {
4238                 ssc_cp_and(data->start_class, uc);
4239                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4240                 ssc_clear_locale(data->start_class);
4241             }
4242             else if (flags & SCF_DO_STCLASS_OR) {
4243                 ssc_add_cp(data->start_class, uc);
4244                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4245
4246                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4247                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4248             }
4249             flags &= ~SCF_DO_STCLASS;
4250         }
4251         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
4252             SSize_t l = STR_LEN(scan);
4253             UV uc = *((U8*)STRING(scan));
4254             SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
4255                                                      separate code points */
4256
4257             /* Search for fixed substrings supports EXACT only. */
4258             if (flags & SCF_DO_SUBSTR) {
4259                 assert(data);
4260                 scan_commit(pRExC_state, data, minlenp, is_inf);
4261             }
4262             if (UTF) {
4263                 const U8 * const s = (U8 *)STRING(scan);
4264                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4265                 l = utf8_length(s, s + l);
4266             }
4267             if (unfolded_multi_char) {
4268                 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4269             }
4270             min += l - min_subtract;
4271             assert (min >= 0);
4272             delta += min_subtract;
4273             if (flags & SCF_DO_SUBSTR) {
4274                 data->pos_min += l - min_subtract;
4275                 if (data->pos_min < 0) {
4276                     data->pos_min = 0;
4277                 }
4278                 data->pos_delta += min_subtract;
4279                 if (min_subtract) {
4280                     data->longest = &(data->longest_float);
4281                 }
4282             }
4283             if (OP(scan) == EXACTFL) {
4284
4285                 /* We don't know what the folds are; it could be anything. XXX
4286                  * Actually, we only support UTF-8 encoding for code points
4287                  * above Latin1, so we could know what those folds are. */
4288                 EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
4289                                                        0,
4290                                                        UV_MAX);
4291             }
4292             else {  /* Non-locale EXACTFish */
4293                 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4294                 if (flags & SCF_DO_STCLASS_AND) {
4295                     ssc_clear_locale(data->start_class);
4296                 }
4297                 if (uc < 256) { /* We know what the Latin1 folds are ... */
4298                     if (IS_IN_SOME_FOLD_L1(uc)) {   /* For instance, we
4299                                                        know if anything folds
4300                                                        with this */
4301                         EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
4302                                                            PL_fold_latin1[uc]);
4303                         if (OP(scan) != EXACTFA) { /* The folds below aren't
4304                                                       legal under /iaa */
4305                             if (isARG2_lower_or_UPPER_ARG1('s', uc)) {
4306                                 EXACTF_invlist
4307                                     = add_cp_to_invlist(EXACTF_invlist,
4308                                                 LATIN_SMALL_LETTER_SHARP_S);
4309                             }
4310                             else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
4311                                 EXACTF_invlist
4312                                     = add_cp_to_invlist(EXACTF_invlist, 's');
4313                                 EXACTF_invlist
4314                                     = add_cp_to_invlist(EXACTF_invlist, 'S');
4315                             }
4316                         }
4317
4318                         /* We also know if there are above-Latin1 code points
4319                          * that fold to this (none legal for ASCII and /iaa) */
4320                         if ((! isASCII(uc) || OP(scan) != EXACTFA)
4321                             && HAS_NONLATIN1_FOLD_CLOSURE(uc))
4322                         {
4323                             /* XXX We could know exactly what does fold to this
4324                              * if the reverse folds are loaded, as currently in
4325                              * S_regclass() */
4326                             _invlist_union(EXACTF_invlist,
4327                                            PL_AboveLatin1,
4328                                            &EXACTF_invlist);
4329                         }
4330                     }
4331                 }
4332                 else {  /* Non-locale, above Latin1.  XXX We don't currently
4333                            know what participates in folds with this, so have
4334                            to assume anything could */
4335
4336                     /* XXX We could know exactly what does fold to this if the
4337                      * reverse folds are loaded, as currently in S_regclass().
4338                      * But we do know that under /iaa nothing in the ASCII
4339                      * range can participate */
4340                     if (OP(scan) == EXACTFA) {
4341                         _invlist_union_complement_2nd(EXACTF_invlist,
4342                                                       PL_XPosix_ptrs[_CC_ASCII],
4343                                                       &EXACTF_invlist);
4344                     }
4345                     else {
4346                         EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
4347                                                                0, UV_MAX);
4348                     }
4349                 }
4350             }
4351             if (flags & SCF_DO_STCLASS_AND) {
4352                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4353                 ANYOF_POSIXL_ZERO(data->start_class);
4354                 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4355             }
4356             else if (flags & SCF_DO_STCLASS_OR) {
4357                 ssc_union(data->start_class, EXACTF_invlist, FALSE);
4358                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4359
4360                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4361                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4362             }
4363             flags &= ~SCF_DO_STCLASS;
4364             SvREFCNT_dec(EXACTF_invlist);
4365         }
4366         else if (REGNODE_VARIES(OP(scan))) {
4367             SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4368             I32 fl = 0, f = flags;
4369             regnode * const oscan = scan;
4370             regnode_ssc this_class;
4371             regnode_ssc *oclass = NULL;
4372             I32 next_is_eval = 0;
4373
4374             switch (PL_regkind[OP(scan)]) {
4375             case WHILEM:                /* End of (?:...)* . */
4376                 scan = NEXTOPER(scan);
4377                 goto finish;
4378             case PLUS:
4379                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4380                     next = NEXTOPER(scan);
4381                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
4382                         mincount = 1;
4383                         maxcount = REG_INFTY;
4384                         next = regnext(scan);
4385                         scan = NEXTOPER(scan);
4386                         goto do_curly;
4387                     }
4388                 }
4389                 if (flags & SCF_DO_SUBSTR)
4390                     data->pos_min++;
4391                 min++;
4392                 /* Fall through. */
4393             case STAR:
4394                 if (flags & SCF_DO_STCLASS) {
4395                     mincount = 0;
4396                     maxcount = REG_INFTY;
4397                     next = regnext(scan);
4398                     scan = NEXTOPER(scan);
4399                     goto do_curly;
4400                 }
4401                 if (flags & SCF_DO_SUBSTR) {
4402                     scan_commit(pRExC_state, data, minlenp, is_inf);
4403                     /* Cannot extend fixed substrings */
4404                     data->longest = &(data->longest_float);
4405                 }
4406                 is_inf = is_inf_internal = 1;
4407                 scan = regnext(scan);
4408                 goto optimize_curly_tail;
4409             case CURLY:
4410                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4411                     && (scan->flags == stopparen))
4412                 {
4413                     mincount = 1;
4414                     maxcount = 1;
4415                 } else {
4416                     mincount = ARG1(scan);
4417                     maxcount = ARG2(scan);
4418                 }
4419                 next = regnext(scan);
4420                 if (OP(scan) == CURLYX) {
4421                     I32 lp = (data ? *(data->last_closep) : 0);
4422                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4423                 }
4424                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4425                 next_is_eval = (OP(scan) == EVAL);
4426               do_curly:
4427                 if (flags & SCF_DO_SUBSTR) {
4428                     if (mincount == 0)
4429                         scan_commit(pRExC_state, data, minlenp, is_inf);
4430                     /* Cannot extend fixed substrings */
4431                     pos_before = data->pos_min;
4432                 }
4433                 if (data) {
4434                     fl = data->flags;
4435                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4436                     if (is_inf)
4437                         data->flags |= SF_IS_INF;
4438                 }
4439                 if (flags & SCF_DO_STCLASS) {
4440                     ssc_init(pRExC_state, &this_class);
4441                     oclass = data->start_class;
4442                     data->start_class = &this_class;
4443                     f |= SCF_DO_STCLASS_AND;
4444                     f &= ~SCF_DO_STCLASS_OR;
4445                 }
4446                 /* Exclude from super-linear cache processing any {n,m}
4447                    regops for which the combination of input pos and regex
4448                    pos is not enough information to determine if a match
4449                    will be possible.
4450
4451                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
4452                    regex pos at the \s*, the prospects for a match depend not
4453                    only on the input position but also on how many (bar\s*)
4454                    repeats into the {4,8} we are. */
4455                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4456                     f &= ~SCF_WHILEM_VISITED_POS;
4457
4458                 /* This will finish on WHILEM, setting scan, or on NULL: */
4459                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4460                                   last, data, stopparen, recursed_depth, NULL,
4461                                   (mincount == 0
4462                                    ? (f & ~SCF_DO_SUBSTR)
4463                                    : f)
4464                                   ,depth+1);
4465
4466                 if (flags & SCF_DO_STCLASS)
4467                     data->start_class = oclass;
4468                 if (mincount == 0 || minnext == 0) {
4469                     if (flags & SCF_DO_STCLASS_OR) {
4470                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4471                     }
4472                     else if (flags & SCF_DO_STCLASS_AND) {
4473                         /* Switch to OR mode: cache the old value of
4474                          * data->start_class */
4475                         INIT_AND_WITHP;
4476                         StructCopy(data->start_class, and_withp, regnode_ssc);
4477                         flags &= ~SCF_DO_STCLASS_AND;
4478                         StructCopy(&this_class, data->start_class, regnode_ssc);
4479                         flags |= SCF_DO_STCLASS_OR;
4480                         ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
4481                     }
4482                 } else {                /* Non-zero len */
4483                     if (flags & SCF_DO_STCLASS_OR) {
4484                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4485                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4486                     }
4487                     else if (flags & SCF_DO_STCLASS_AND)
4488                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4489                     flags &= ~SCF_DO_STCLASS;
4490                 }
4491                 if (!scan)              /* It was not CURLYX, but CURLY. */
4492                     scan = next;
4493                 if (!(flags & SCF_TRIE_DOING_RESTUDY)
4494                     /* ? quantifier ok, except for (?{ ... }) */
4495                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
4496                     && (minnext == 0) && (deltanext == 0)
4497                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4498                     && maxcount <= REG_INFTY/3) /* Complement check for big
4499                                                    count */
4500                 {
4501                     /* Fatal warnings may leak the regexp without this: */
4502                     SAVEFREESV(RExC_rx_sv);
4503                     ckWARNreg(RExC_parse,
4504                             "Quantifier unexpected on zero-length expression");
4505                     (void)ReREFCNT_inc(RExC_rx_sv);
4506                 }
4507
4508                 min += minnext * mincount;
4509                 is_inf_internal |= deltanext == SSize_t_MAX
4510                          || (maxcount == REG_INFTY && minnext + deltanext > 0);
4511                 is_inf |= is_inf_internal;
4512                 if (is_inf) {
4513                     delta = SSize_t_MAX;
4514                 } else {
4515                     delta += (minnext + deltanext) * maxcount
4516                              - minnext * mincount;
4517                 }
4518                 /* Try powerful optimization CURLYX => CURLYN. */
4519                 if (  OP(oscan) == CURLYX && data
4520                       && data->flags & SF_IN_PAR
4521                       && !(data->flags & SF_HAS_EVAL)
4522                       && !deltanext && minnext == 1 ) {
4523                     /* Try to optimize to CURLYN.  */
4524                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4525                     regnode * const nxt1 = nxt;
4526 #ifdef DEBUGGING
4527                     regnode *nxt2;
4528 #endif
4529
4530                     /* Skip open. */
4531                     nxt = regnext(nxt);
4532                     if (!REGNODE_SIMPLE(OP(nxt))
4533                         && !(PL_regkind[OP(nxt)] == EXACT
4534                              && STR_LEN(nxt) == 1))
4535                         goto nogo;
4536 #ifdef DEBUGGING
4537                     nxt2 = nxt;
4538 #endif
4539                     nxt = regnext(nxt);
4540                     if (OP(nxt) != CLOSE)
4541                         goto nogo;
4542                     if (RExC_open_parens) {
4543                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4544                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4545                     }
4546                     /* Now we know that nxt2 is the only contents: */
4547                     oscan->flags = (U8)ARG(nxt);
4548                     OP(oscan) = CURLYN;
4549                     OP(nxt1) = NOTHING; /* was OPEN. */
4550
4551 #ifdef DEBUGGING
4552                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4553                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4554                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4555                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
4556                     OP(nxt + 1) = OPTIMIZED; /* was count. */
4557                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4558 #endif
4559                 }
4560               nogo:
4561
4562                 /* Try optimization CURLYX => CURLYM. */
4563                 if (  OP(oscan) == CURLYX && data
4564                       && !(data->flags & SF_HAS_PAR)
4565                       && !(data->flags & SF_HAS_EVAL)
4566                       && !deltanext     /* atom is fixed width */
4567                       && minnext != 0   /* CURLYM can't handle zero width */
4568
4569                          /* Nor characters whose fold at run-time may be
4570                           * multi-character */
4571                       && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
4572                 ) {
4573                     /* XXXX How to optimize if data == 0? */
4574                     /* Optimize to a simpler form.  */
4575                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4576                     regnode *nxt2;
4577
4578                     OP(oscan) = CURLYM;
4579                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4580                             && (OP(nxt2) != WHILEM))
4581                         nxt = nxt2;
4582                     OP(nxt2)  = SUCCEED; /* Was WHILEM */
4583                     /* Need to optimize away parenths. */
4584                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4585                         /* Set the parenth number.  */
4586                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4587
4588                         oscan->flags = (U8)ARG(nxt);
4589                         if (RExC_open_parens) {
4590                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4591                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4592                         }
4593                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
4594                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
4595
4596 #ifdef DEBUGGING
4597                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4598                         OP(nxt + 1) = OPTIMIZED; /* was count. */
4599                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4600                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4601 #endif
4602 #if 0
4603                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
4604                             regnode *nnxt = regnext(nxt1);
4605                             if (nnxt == nxt) {
4606                                 if (reg_off_by_arg[OP(nxt1)])
4607                                     ARG_SET(nxt1, nxt2 - nxt1);
4608                                 else if (nxt2 - nxt1 < U16_MAX)
4609                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
4610                                 else
4611                                     OP(nxt) = NOTHING;  /* Cannot beautify */
4612                             }
4613                             nxt1 = nnxt;
4614                         }
4615 #endif
4616                         /* Optimize again: */
4617                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4618                                     NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4619                     }
4620                     else
4621                         oscan->flags = 0;
4622                 }
4623                 else if ((OP(oscan) == CURLYX)
4624                          && (flags & SCF_WHILEM_VISITED_POS)
4625                          /* See the comment on a similar expression above.
4626                             However, this time it's not a subexpression
4627                             we care about, but the expression itself. */
4628                          && (maxcount == REG_INFTY)
4629                          && data && ++data->whilem_c < 16) {
4630                     /* This stays as CURLYX, we can put the count/of pair. */
4631                     /* Find WHILEM (as in regexec.c) */
4632                     regnode *nxt = oscan + NEXT_OFF(oscan);
4633
4634                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4635                         nxt += ARG(nxt);
4636                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
4637                         | (RExC_whilem_seen << 4)); /* On WHILEM */
4638                 }
4639                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4640                     pars++;
4641                 if (flags & SCF_DO_SUBSTR) {
4642                     SV *last_str = NULL;
4643                     STRLEN last_chrs = 0;
4644                     int counted = mincount != 0;
4645
4646                     if (data->last_end > 0 && mincount != 0) { /* Ends with a
4647                                                                   string. */
4648                         SSize_t b = pos_before >= data->last_start_min
4649                             ? pos_before : data->last_start_min;
4650                         STRLEN l;
4651                         const char * const s = SvPV_const(data->last_found, l);
4652                         SSize_t old = b - data->last_start_min;
4653
4654                         if (UTF)
4655                             old = utf8_hop((U8*)s, old) - (U8*)s;
4656                         l -= old;
4657                         /* Get the added string: */
4658                         last_str = newSVpvn_utf8(s  + old, l, UTF);
4659                         last_chrs = UTF ? utf8_length((U8*)(s + old),
4660                                             (U8*)(s + old + l)) : l;
4661                         if (deltanext == 0 && pos_before == b) {
4662                             /* What was added is a constant string */
4663                             if (mincount > 1) {
4664
4665                                 SvGROW(last_str, (mincount * l) + 1);
4666                                 repeatcpy(SvPVX(last_str) + l,
4667                                           SvPVX_const(last_str), l,
4668                                           mincount - 1);
4669                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4670                                 /* Add additional parts. */
4671                                 SvCUR_set(data->last_found,
4672                                           SvCUR(data->last_found) - l);
4673                                 sv_catsv(data->last_found, last_str);
4674                                 {
4675                                     SV * sv = data->last_found;
4676                                     MAGIC *mg =
4677                                         SvUTF8(sv) && SvMAGICAL(sv) ?
4678                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4679                                     if (mg && mg->mg_len >= 0)
4680                                         mg->mg_len += last_chrs * (mincount-1);
4681                                 }
4682                                 last_chrs *= mincount;
4683                                 data->last_end += l * (mincount - 1);
4684                             }
4685                         } else {
4686                             /* start offset must point into the last copy */
4687                             data->last_start_min += minnext * (mincount - 1);
4688                             data->last_start_max += is_inf ? SSize_t_MAX
4689                                 : (maxcount - 1) * (minnext + data->pos_delta);
4690                         }
4691                     }
4692                     /* It is counted once already... */
4693                     data->pos_min += minnext * (mincount - counted);
4694 #if 0
4695 PerlIO_printf(Perl_debug_log, "counted=%"UVdf" deltanext=%"UVdf
4696                               " SSize_t_MAX=%"UVdf" minnext=%"UVdf
4697                               " maxcount=%"UVdf" mincount=%"UVdf"\n",
4698     (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4699     (UV)mincount);
4700 if (deltanext != SSize_t_MAX)
4701 PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
4702     (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4703           - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4704 #endif
4705                     if (deltanext == SSize_t_MAX
4706                         || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4707                         data->pos_delta = SSize_t_MAX;
4708                     else
4709                         data->pos_delta += - counted * deltanext +
4710                         (minnext + deltanext) * maxcount - minnext * mincount;
4711                     if (mincount != maxcount) {
4712                          /* Cannot extend fixed substrings found inside
4713                             the group.  */
4714                         scan_commit(pRExC_state, data, minlenp, is_inf);
4715                         if (mincount && last_str) {
4716                             SV * const sv = data->last_found;
4717                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4718                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4719
4720                             if (mg)
4721                                 mg->mg_len = -1;
4722                             sv_setsv(sv, last_str);
4723                             data->last_end = data->pos_min;
4724                             data->last_start_min = data->pos_min - last_chrs;
4725                             data->last_start_max = is_inf
4726                                 ? SSize_t_MAX
4727                                 : data->pos_min + data->pos_delta - last_chrs;
4728                         }
4729                         data->longest = &(data->longest_float);
4730                     }
4731                     SvREFCNT_dec(last_str);
4732                 }
4733                 if (data && (fl & SF_HAS_EVAL))
4734                     data->flags |= SF_HAS_EVAL;
4735               optimize_curly_tail:
4736                 if (OP(oscan) != CURLYX) {
4737                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4738                            && NEXT_OFF(next))
4739                         NEXT_OFF(oscan) += NEXT_OFF(next);
4740                 }
4741                 continue;
4742
4743             default:
4744 #ifdef DEBUGGING
4745                 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
4746                                                                     OP(scan));
4747 #endif
4748             case REF:
4749             case CLUMP:
4750                 if (flags & SCF_DO_SUBSTR) {
4751                     /* Cannot expect anything... */
4752                     scan_commit(pRExC_state, data, minlenp, is_inf);
4753                     data->longest = &(data->longest_float);
4754                 }
4755                 is_inf = is_inf_internal = 1;
4756                 if (flags & SCF_DO_STCLASS_OR) {
4757                     if (OP(scan) == CLUMP) {
4758                         /* Actually is any start char, but very few code points
4759                          * aren't start characters */
4760                         ssc_match_all_cp(data->start_class);
4761                     }
4762                     else {
4763                         ssc_anything(data->start_class);
4764                     }
4765                 }
4766                 flags &= ~SCF_DO_STCLASS;
4767                 break;
4768             }
4769         }
4770         else if (OP(scan) == LNBREAK) {
4771             if (flags & SCF_DO_STCLASS) {
4772                 if (flags & SCF_DO_STCLASS_AND) {
4773                     ssc_intersection(data->start_class,
4774                                     PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
4775                     ssc_clear_locale(data->start_class);
4776                     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4777                 }
4778                 else if (flags & SCF_DO_STCLASS_OR) {
4779                     ssc_union(data->start_class,
4780                               PL_XPosix_ptrs[_CC_VERTSPACE],
4781                               FALSE);
4782                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4783
4784                     /* See commit msg for
4785                      * 749e076fceedeb708a624933726e7989f2302f6a */
4786                     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4787                 }
4788                 flags &= ~SCF_DO_STCLASS;
4789             }
4790             min++;
4791             delta++;    /* Because of the 2 char string cr-lf */
4792             if (flags & SCF_DO_SUBSTR) {
4793                 /* Cannot expect anything... */
4794                 scan_commit(pRExC_state, data, minlenp, is_inf);
4795                 data->pos_min += 1;
4796                 data->pos_delta += 1;
4797                 data->longest = &(data->longest_float);
4798             }
4799         }
4800         else if (REGNODE_SIMPLE(OP(scan))) {
4801
4802             if (flags & SCF_DO_SUBSTR) {
4803                 scan_commit(pRExC_state, data, minlenp, is_inf);
4804                 data->pos_min++;
4805             }
4806             min++;
4807             if (flags & SCF_DO_STCLASS) {
4808                 bool invert = 0;
4809                 SV* my_invlist = sv_2mortal(_new_invlist(0));
4810                 U8 namedclass;
4811
4812                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4813                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4814
4815                 /* Some of the logic below assumes that switching
4816                    locale on will only add false positives. */
4817                 switch (OP(scan)) {
4818
4819                 default:
4820 #ifdef DEBUGGING
4821                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
4822                                                                      OP(scan));
4823 #endif
4824                 case CANY:
4825                 case SANY:
4826                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4827                         ssc_match_all_cp(data->start_class);
4828                     break;
4829
4830                 case REG_ANY:
4831                     {
4832                         SV* REG_ANY_invlist = _new_invlist(2);
4833                         REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
4834                                                             '\n');
4835                         if (flags & SCF_DO_STCLASS_OR) {
4836                             ssc_union(data->start_class,
4837                                       REG_ANY_invlist,
4838                                       TRUE /* TRUE => invert, hence all but \n
4839                                             */
4840                                       );
4841                         }
4842                         else if (flags & SCF_DO_STCLASS_AND) {
4843                             ssc_intersection(data->start_class,
4844                                              REG_ANY_invlist,
4845                                              TRUE  /* TRUE => invert */
4846                                              );
4847                             ssc_clear_locale(data->start_class);
4848                         }
4849                         SvREFCNT_dec_NN(REG_ANY_invlist);
4850                     }
4851                     break;
4852
4853                 case ANYOF:
4854                     if (flags & SCF_DO_STCLASS_AND)
4855                         ssc_and(pRExC_state, data->start_class,
4856                                 (regnode_charclass *) scan);
4857                     else
4858                         ssc_or(pRExC_state, data->start_class,
4859                                                           (regnode_charclass *) scan);
4860                     break;
4861
4862                 case NPOSIXL:
4863                     invert = 1;
4864                     /* FALL THROUGH */
4865
4866                 case POSIXL:
4867                     namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
4868                     if (flags & SCF_DO_STCLASS_AND) {
4869                         bool was_there = cBOOL(
4870                                           ANYOF_POSIXL_TEST(data->start_class,
4871                                                                  namedclass));
4872                         ANYOF_POSIXL_ZERO(data->start_class);
4873                         if (was_there) {    /* Do an AND */
4874                             ANYOF_POSIXL_SET(data->start_class, namedclass);
4875                         }
4876                         /* No individual code points can now match */
4877                         data->start_class->invlist
4878                                                 = sv_2mortal(_new_invlist(0));
4879                     }
4880                     else {
4881                         int complement = namedclass + ((invert) ? -1 : 1);
4882
4883                         assert(flags & SCF_DO_STCLASS_OR);
4884
4885                         /* If the complement of this class was already there,
4886                          * the result is that they match all code points,
4887                          * (\d + \D == everything).  Remove the classes from
4888                          * future consideration.  Locale is not relevant in
4889                          * this case */
4890                         if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
4891                             ssc_match_all_cp(data->start_class);
4892                             ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
4893                             ANYOF_POSIXL_CLEAR(data->start_class, complement);
4894                         }
4895                         else {  /* The usual case; just add this class to the
4896                                    existing set */
4897                             ANYOF_POSIXL_SET(data->start_class, namedclass);
4898                         }
4899                     }
4900                     break;
4901
4902                 case NPOSIXA:   /* For these, we always know the exact set of
4903                                    what's matched */
4904                     invert = 1;
4905                     /* FALL THROUGH */
4906                 case POSIXA:
4907                     if (FLAGS(scan) == _CC_ASCII) {
4908                         my_invlist = PL_XPosix_ptrs[_CC_ASCII];
4909                     }
4910                     else {
4911                         _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
4912                                               PL_XPosix_ptrs[_CC_ASCII],
4913                                               &my_invlist);
4914                     }
4915                     goto join_posix;
4916
4917                 case NPOSIXD:
4918                 case NPOSIXU:
4919                     invert = 1;
4920                     /* FALL THROUGH */
4921                 case POSIXD:
4922                 case POSIXU:
4923                     my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
4924
4925                     /* NPOSIXD matches all upper Latin1 code points unless the
4926                      * target string being matched is UTF-8, which is
4927                      * unknowable until match time.  Since we are going to
4928                      * invert, we want to get rid of all of them so that the
4929                      * inversion will match all */
4930                     if (OP(scan) == NPOSIXD) {
4931                         _invlist_subtract(my_invlist, PL_UpperLatin1,
4932                                           &my_invlist);
4933                     }
4934
4935                   join_posix:
4936
4937                     if (flags & SCF_DO_STCLASS_AND) {
4938                         ssc_intersection(data->start_class, my_invlist, invert);
4939                         ssc_clear_locale(data->start_class);
4940                     }
4941                     else {
4942                         assert(flags & SCF_DO_STCLASS_OR);
4943                         ssc_union(data->start_class, my_invlist, invert);
4944                     }
4945                 }
4946                 if (flags & SCF_DO_STCLASS_OR)
4947                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4948                 flags &= ~SCF_DO_STCLASS;
4949             }
4950         }
4951         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4952             data->flags |= (OP(scan) == MEOL
4953                             ? SF_BEFORE_MEOL
4954                             : SF_BEFORE_SEOL);
4955             scan_commit(pRExC_state, data, minlenp, is_inf);
4956
4957         }
4958         else if (  PL_regkind[OP(scan)] == BRANCHJ
4959                  /* Lookbehind, or need to calculate parens/evals/stclass: */
4960                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
4961                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4962             if ( OP(scan) == UNLESSM &&
4963                  scan->flags == 0 &&
4964                  OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4965                  OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4966             ) {
4967                 regnode *opt;
4968                 regnode *upto= regnext(scan);
4969                 DEBUG_PARSE_r({
4970                     SV * const mysv_val=sv_newmortal();
4971                     DEBUG_STUDYDATA("OPFAIL",data,depth);
4972
4973                     /*DEBUG_PARSE_MSG("opfail");*/
4974                     regprop(RExC_rx, mysv_val, upto, NULL);
4975                     PerlIO_printf(Perl_debug_log,
4976                         "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4977                         SvPV_nolen_const(mysv_val),
4978                         (IV)REG_NODE_NUM(upto),
4979                         (IV)(upto - scan)
4980                     );
4981                 });
4982                 OP(scan) = OPFAIL;
4983                 NEXT_OFF(scan) = upto - scan;
4984                 for (opt= scan + 1; opt < upto ; opt++)
4985                     OP(opt) = OPTIMIZED;
4986                 scan= upto;
4987                 continue;
4988             }
4989             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4990                 || OP(scan) == UNLESSM )
4991             {
4992                 /* Negative Lookahead/lookbehind
4993                    In this case we can't do fixed string optimisation.
4994                 */
4995
4996                 SSize_t deltanext, minnext, fake = 0;
4997                 regnode *nscan;
4998                 regnode_ssc intrnl;
4999                 int f = 0;
5000
5001                 data_fake.flags = 0;
5002                 if (data) {
5003                     data_fake.whilem_c = data->whilem_c;
5004                     data_fake.last_closep = data->last_closep;
5005                 }
5006                 else
5007                     data_fake.last_closep = &fake;
5008                 data_fake.pos_delta = delta;
5009                 if ( flags & SCF_DO_STCLASS && !scan->flags
5010                      && OP(scan) == IFMATCH ) { /* Lookahead */
5011                     ssc_init(pRExC_state, &intrnl);
5012                     data_fake.start_class = &intrnl;
5013                     f |= SCF_DO_STCLASS_AND;
5014                 }
5015                 if (flags & SCF_WHILEM_VISITED_POS)
5016                     f |= SCF_WHILEM_VISITED_POS;
5017                 next = regnext(scan);
5018                 nscan = NEXTOPER(NEXTOPER(scan));
5019                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5020                                       last, &data_fake, stopparen,
5021                                       recursed_depth, NULL, f, depth+1);
5022                 if (scan->flags) {
5023                     if (deltanext) {
5024                         FAIL("Variable length lookbehind not implemented");
5025                     }
5026                     else if (minnext > (I32)U8_MAX) {
5027                         FAIL2("Lookbehind longer than %"UVuf" not implemented",
5028                               (UV)U8_MAX);
5029                     }
5030                     scan->flags = (U8)minnext;
5031                 }
5032                 if (data) {
5033                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5034                         pars++;
5035                     if (data_fake.flags & SF_HAS_EVAL)
5036                         data->flags |= SF_HAS_EVAL;
5037                     data->whilem_c = data_fake.whilem_c;
5038                 }
5039                 if (f & SCF_DO_STCLASS_AND) {
5040                     if (flags & SCF_DO_STCLASS_OR) {
5041                         /* OR before, AND after: ideally we would recurse with
5042                          * data_fake to get the AND applied by study of the
5043                          * remainder of the pattern, and then derecurse;
5044                          * *** HACK *** for now just treat as "no information".
5045                          * See [perl #56690].
5046                          */
5047                         ssc_init(pRExC_state, data->start_class);
5048                     }  else {
5049                         /* AND before and after: combine and continue */
5050                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5051                     }
5052                 }
5053             }
5054 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5055             else {
5056                 /* Positive Lookahead/lookbehind
5057                    In this case we can do fixed string optimisation,
5058                    but we must be careful about it. Note in the case of
5059                    lookbehind the positions will be offset by the minimum
5060                    length of the pattern, something we won't know about
5061                    until after the recurse.
5062                 */
5063                 SSize_t deltanext, fake = 0;
5064                 regnode *nscan;
5065                 regnode_ssc intrnl;
5066                 int f = 0;
5067                 /* We use SAVEFREEPV so that when the full compile
5068                     is finished perl will clean up the allocated
5069                     minlens when it's all done. This way we don't
5070                     have to worry about freeing them when we know
5071                     they won't be used, which would be a pain.
5072                  */
5073                 SSize_t *minnextp;
5074                 Newx( minnextp, 1, SSize_t );
5075                 SAVEFREEPV(minnextp);
5076
5077                 if (data) {
5078                     StructCopy(data, &data_fake, scan_data_t);
5079                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5080                         f |= SCF_DO_SUBSTR;
5081                         if (scan->flags)
5082                             scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5083                         data_fake.last_found=newSVsv(data->last_found);
5084                     }
5085                 }
5086                 else
5087                     data_fake.last_closep = &fake;
5088                 data_fake.flags = 0;
5089                 data_fake.pos_delta = delta;
5090                 if (is_inf)
5091                     data_fake.flags |= SF_IS_INF;
5092                 if ( flags & SCF_DO_STCLASS && !scan->flags
5093                      && OP(scan) == IFMATCH ) { /* Lookahead */
5094                     ssc_init(pRExC_state, &intrnl);
5095                     data_fake.start_class = &intrnl;
5096                     f |= SCF_DO_STCLASS_AND;
5097                 }
5098                 if (flags & SCF_WHILEM_VISITED_POS)
5099                     f |= SCF_WHILEM_VISITED_POS;
5100                 next = regnext(scan);
5101                 nscan = NEXTOPER(NEXTOPER(scan));
5102
5103                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5104                                         &deltanext, last, &data_fake,
5105                                         stopparen, recursed_depth, NULL,
5106                                         f,depth+1);
5107                 if (scan->flags) {
5108                     if (deltanext) {
5109                         FAIL("Variable length lookbehind not implemented");
5110                     }
5111                     else if (*minnextp > (I32)U8_MAX) {
5112                         FAIL2("Lookbehind longer than %"UVuf" not implemented",
5113                               (UV)U8_MAX);
5114                     }
5115                     scan->flags = (U8)*minnextp;
5116                 }
5117
5118                 *minnextp += min;
5119
5120                 if (f & SCF_DO_STCLASS_AND) {
5121                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5122                 }
5123                 if (data) {
5124                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5125                         pars++;
5126                     if (data_fake.flags & SF_HAS_EVAL)
5127                         data->flags |= SF_HAS_EVAL;
5128                     data->whilem_c = data_fake.whilem_c;
5129                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5130                         if (RExC_rx->minlen<*minnextp)
5131                             RExC_rx->minlen=*minnextp;
5132                         scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5133                         SvREFCNT_dec_NN(data_fake.last_found);
5134
5135                         if ( data_fake.minlen_fixed != minlenp )
5136                         {
5137                             data->offset_fixed= data_fake.offset_fixed;
5138                             data->minlen_fixed= data_fake.minlen_fixed;
5139                             data->lookbehind_fixed+= scan->flags;
5140                         }
5141                         if ( data_fake.minlen_float != minlenp )
5142                         {
5143                             data->minlen_float= data_fake.minlen_float;
5144                             data->offset_float_min=data_fake.offset_float_min;
5145                             data->offset_float_max=data_fake.offset_float_max;
5146                             data->lookbehind_float+= scan->flags;
5147                         }
5148                     }
5149                 }
5150             }
5151 #endif
5152         }
5153         else if (OP(scan) == OPEN) {
5154             if (stopparen != (I32)ARG(scan))
5155                 pars++;
5156         }
5157         else if (OP(scan) == CLOSE) {
5158             if (stopparen == (I32)ARG(scan)) {
5159                 break;
5160             }
5161             if ((I32)ARG(scan) == is_par) {
5162                 next = regnext(scan);
5163
5164                 if ( next && (OP(next) != WHILEM) && next < last)
5165                     is_par = 0;         /* Disable optimization */
5166             }
5167             if (data)
5168                 *(data->last_closep) = ARG(scan);
5169         }
5170         else if (OP(scan) == EVAL) {
5171                 if (data)
5172                     data->flags |= SF_HAS_EVAL;
5173         }
5174         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5175             if (flags & SCF_DO_SUBSTR) {
5176                 scan_commit(pRExC_state, data, minlenp, is_inf);
5177                 flags &= ~SCF_DO_SUBSTR;
5178             }
5179             if (data && OP(scan)==ACCEPT) {
5180                 data->flags |= SCF_SEEN_ACCEPT;
5181                 if (stopmin > min)
5182                     stopmin = min;
5183             }
5184         }
5185         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5186         {
5187                 if (flags & SCF_DO_SUBSTR) {
5188                     scan_commit(pRExC_state, data, minlenp, is_inf);
5189                     data->longest = &(data->longest_float);
5190                 }
5191                 is_inf = is_inf_internal = 1;
5192                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5193                     ssc_anything(data->start_class);
5194                 flags &= ~SCF_DO_STCLASS;
5195         }
5196         else if (OP(scan) == GPOS) {
5197             if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5198                 !(delta || is_inf || (data && data->pos_delta)))
5199             {
5200                 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5201                     RExC_rx->intflags |= PREGf_ANCH_GPOS;
5202                 if (RExC_rx->gofs < (STRLEN)min)
5203                     RExC_rx->gofs = min;
5204             } else {
5205                 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5206                 RExC_rx->gofs = 0;
5207             }
5208         }
5209 #ifdef TRIE_STUDY_OPT
5210 #ifdef FULL_TRIE_STUDY
5211         else if (PL_regkind[OP(scan)] == TRIE) {
5212             /* NOTE - There is similar code to this block above for handling
5213                BRANCH nodes on the initial study.  If you change stuff here
5214                check there too. */
5215             regnode *trie_node= scan;
5216             regnode *tail= regnext(scan);
5217             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5218             SSize_t max1 = 0, min1 = SSize_t_MAX;
5219             regnode_ssc accum;
5220
5221             if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5222                 /* Cannot merge strings after this. */
5223                 scan_commit(pRExC_state, data, minlenp, is_inf);
5224             }
5225             if (flags & SCF_DO_STCLASS)
5226                 ssc_init_zero(pRExC_state, &accum);
5227
5228             if (!trie->jump) {
5229                 min1= trie->minlen;
5230                 max1= trie->maxlen;
5231             } else {
5232                 const regnode *nextbranch= NULL;
5233                 U32 word;
5234
5235                 for ( word=1 ; word <= trie->wordcount ; word++)
5236                 {
5237                     SSize_t deltanext=0, minnext=0, f = 0, fake;
5238                     regnode_ssc this_class;
5239
5240                     data_fake.flags = 0;
5241                     if (data) {
5242                         data_fake.whilem_c = data->whilem_c;
5243                         data_fake.last_closep = data->last_closep;
5244                     }
5245                     else
5246                         data_fake.last_closep = &fake;
5247                     data_fake.pos_delta = delta;
5248                     if (flags & SCF_DO_STCLASS) {
5249                         ssc_init(pRExC_state, &this_class);
5250                         data_fake.start_class = &this_class;
5251                         f = SCF_DO_STCLASS_AND;
5252                     }
5253                     if (flags & SCF_WHILEM_VISITED_POS)
5254                         f |= SCF_WHILEM_VISITED_POS;
5255
5256                     if (trie->jump[word]) {
5257                         if (!nextbranch)
5258                             nextbranch = trie_node + trie->jump[0];
5259                         scan= trie_node + trie->jump[word];
5260                         /* We go from the jump point to the branch that follows
5261                            it. Note this means we need the vestigal unused
5262                            branches even though they arent otherwise used. */
5263                         minnext = study_chunk(pRExC_state, &scan, minlenp,
5264                             &deltanext, (regnode *)nextbranch, &data_fake,
5265                             stopparen, recursed_depth, NULL, f,depth+1);
5266                     }
5267                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5268                         nextbranch= regnext((regnode*)nextbranch);
5269
5270                     if (min1 > (SSize_t)(minnext + trie->minlen))
5271                         min1 = minnext + trie->minlen;
5272                     if (deltanext == SSize_t_MAX) {
5273                         is_inf = is_inf_internal = 1;
5274                         max1 = SSize_t_MAX;
5275                     } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5276                         max1 = minnext + deltanext + trie->maxlen;
5277
5278                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5279                         pars++;
5280                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
5281                         if ( stopmin > min + min1)
5282                             stopmin = min + min1;
5283                         flags &= ~SCF_DO_SUBSTR;
5284                         if (data)
5285                             data->flags |= SCF_SEEN_ACCEPT;
5286                     }
5287                     if (data) {
5288                         if (data_fake.flags & SF_HAS_EVAL)
5289                             data->flags |= SF_HAS_EVAL;
5290                         data->whilem_c = data_fake.whilem_c;
5291                     }
5292                     if (flags & SCF_DO_STCLASS)
5293                         ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5294                 }
5295             }
5296             if (flags & SCF_DO_SUBSTR) {
5297                 data->pos_min += min1;
5298                 data->pos_delta += max1 - min1;
5299                 if (max1 != min1 || is_inf)
5300                     data->longest = &(data->longest_float);
5301             }
5302             min += min1;
5303             delta += max1 - min1;
5304             if (flags & SCF_DO_STCLASS_OR) {
5305                 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5306                 if (min1) {
5307                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5308                     flags &= ~SCF_DO_STCLASS;
5309                 }
5310             }
5311             else if (flags & SCF_DO_STCLASS_AND) {
5312                 if (min1) {
5313                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5314                     flags &= ~SCF_DO_STCLASS;
5315                 }
5316                 else {
5317                     /* Switch to OR mode: cache the old value of
5318                      * data->start_class */
5319                     INIT_AND_WITHP;
5320                     StructCopy(data->start_class, and_withp, regnode_ssc);
5321                     flags &= ~SCF_DO_STCLASS_AND;
5322                     StructCopy(&accum, data->start_class, regnode_ssc);
5323                     flags |= SCF_DO_STCLASS_OR;
5324                 }
5325             }
5326             scan= tail;
5327             continue;
5328         }
5329 #else
5330         else if (PL_regkind[OP(scan)] == TRIE) {
5331             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5332             U8*bang=NULL;
5333
5334             min += trie->minlen;
5335             delta += (trie->maxlen - trie->minlen);
5336             flags &= ~SCF_DO_STCLASS; /* xxx */
5337             if (flags & SCF_DO_SUBSTR) {
5338                 /* Cannot expect anything... */
5339                 scan_commit(pRExC_state, data, minlenp, is_inf);
5340                 data->pos_min += trie->minlen;
5341                 data->pos_delta += (trie->maxlen - trie->minlen);
5342                 if (trie->maxlen != trie->minlen)
5343                     data->longest = &(data->longest_float);
5344             }
5345             if (trie->jump) /* no more substrings -- for now /grr*/
5346                flags &= ~SCF_DO_SUBSTR;
5347         }
5348 #endif /* old or new */
5349 #endif /* TRIE_STUDY_OPT */
5350
5351         /* Else: zero-length, ignore. */
5352         scan = regnext(scan);
5353     }
5354     /* If we are exiting a recursion we can unset its recursed bit
5355      * and allow ourselves to enter it again - no danger of an
5356      * infinite loop there.
5357     if (stopparen > -1 && recursed) {
5358         DEBUG_STUDYDATA("unset:", data,depth);
5359         PAREN_UNSET( recursed, stopparen);
5360     }
5361     */
5362     if (frame) {
5363         DEBUG_STUDYDATA("frame-end:",data,depth);
5364         DEBUG_PEEP("fend", scan, depth);
5365         /* restore previous context */
5366         last = frame->last;
5367         scan = frame->next;
5368         stopparen = frame->stop;
5369         recursed_depth = frame->prev_recursed_depth;
5370         depth = depth - 1;
5371
5372         frame = frame->prev;
5373         goto fake_study_recurse;
5374     }
5375
5376   finish:
5377     assert(!frame);
5378     DEBUG_STUDYDATA("pre-fin:",data,depth);
5379
5380     *scanp = scan;
5381     *deltap = is_inf_internal ? SSize_t_MAX : delta;
5382
5383     if (flags & SCF_DO_SUBSTR && is_inf)
5384         data->pos_delta = SSize_t_MAX - data->pos_min;
5385     if (is_par > (I32)U8_MAX)
5386         is_par = 0;
5387     if (is_par && pars==1 && data) {
5388         data->flags |= SF_IN_PAR;
5389         data->flags &= ~SF_HAS_PAR;
5390     }
5391     else if (pars && data) {
5392         data->flags |= SF_HAS_PAR;
5393         data->flags &= ~SF_IN_PAR;
5394     }
5395     if (flags & SCF_DO_STCLASS_OR)
5396         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5397     if (flags & SCF_TRIE_RESTUDY)
5398         data->flags |=  SCF_TRIE_RESTUDY;
5399
5400     DEBUG_STUDYDATA("post-fin:",data,depth);
5401
5402     {
5403         SSize_t final_minlen= min < stopmin ? min : stopmin;
5404
5405         if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) {
5406             RExC_maxlen = final_minlen + delta;
5407         }
5408         return final_minlen;
5409     }
5410     /* not-reached */
5411 }
5412
5413 STATIC U32
5414 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5415 {
         /* Grow the reg_data table hanging off RExC_rxi by n slots and copy
          * the n bytes at s into the parallel 'what' array for those slots.
          * The data[] pointers of the new slots are NOT filled in here.
          * Returns the index of the first newly added slot, i.e. the slot
          * count before the call. */
5416     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5417
5418     PERL_ARGS_ASSERT_ADD_DATA;
5419
         /* Resize (or, when ->data is still NULL, create) the table.  The
          * "- 1" presumably accounts for one pointer slot that is already
          * part of struct reg_data -- confirm against its declaration. */
5420     Renewc(RExC_rxi->data,
5421            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5422            char, struct reg_data);
         /* With no previous entries there is no 'what' array to extend, so
          * allocate a fresh one instead of resizing. */
5423     if(count)
5424         Renew(RExC_rxi->data->what, count + n, U8);
5425     else
5426         Newx(RExC_rxi->data->what, n, U8);
5427     RExC_rxi->data->count = count + n;
5428     Copy(s, RExC_rxi->data->what + count, n, U8);
5429     return count;
5430 }
5431
5432 /*XXX: todo make this not included in a non debugging perl */
5433 #ifndef PERL_IN_XSUB_RE
5434 void
5435 Perl_reginitcolors(pTHX)
5436 {
         /* Fill PL_colors[0..5] from the PERL_RE_COLORS environment variable,
          * which is treated as up to six tab-separated fields, and then mark
          * the colors as initialised by setting PL_colorset.  Fields that are
          * missing, or all six if the variable is unset, become "". */
5437     dVAR;
5438     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5439     if (s) {
             /* Work on a private copy: the tabs are overwritten with NULs and
              * PL_colors[] keeps pointing into this copy, which is not freed
              * in this function. */
5440         char *t = savepv(s);
5441         int i = 0;
5442         PL_colors[0] = t;
5443         while (++i < 6) {
5444             t = strchr(t, '\t');
5445             if (t) {
5446                 *t = '\0';
5447                 PL_colors[i] = ++t;
5448             }
5449             else
                     /* Ran out of fields: point t at "" as well, so every
                      * later strchr() also fails and the rest become "". */
5450                 PL_colors[i] = t = (char *)"";
5451         }
5452     } else {
5453         int i = 0;
5454         while (i < 6)
5455             PL_colors[i++] = (char *)"";
5456     }
5457     PL_colorset = 1;
5458 }
5459 #endif
5460
5461
5462 #ifdef TRIE_STUDY_OPT
5463 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
5464     STMT_START {                                            \
5465         if (                                                \
5466               (data.flags & SCF_TRIE_RESTUDY)               \
5467               && ! restudied++                              \
5468         ) {                                                 \
5469             dOsomething;                                    \
5470             goto reStudy;                                   \
5471         }                                                   \
5472     } STMT_END
5473 #else
5474 #define CHECK_RESTUDY_GOTO_butfirst
5475 #endif
5476
5477 /*
5478  * pregcomp - compile a regular expression into internal code
5479  *
5480  * Decides which engine's compiler to call based on the hint currently in
5481  * scope
5482  */
5483
5484 #ifndef PERL_IN_XSUB_RE
5485
5486 /* return the currently in-scope regex engine (or the default if none)  */
5487
5488 regexp_engine const *
5489 Perl_current_re_engine(pTHX)
5490 {
         /* The engine is looked up under the hints key "regcomp"; its value
          * is expected to be a non-zero integer holding a regexp_engine
          * pointer.  If the key is absent, not an integer, or zero, the core
          * engine (PL_core_reg_engine) is returned. */
5491     dVAR;
5492
5493     if (IN_PERL_COMPILETIME) {
             /* While perl is compiling: consult the hints hash held in
              * PL_hintgv, but only when HINT_LOCALIZE_HH is set in PL_hints. */
5494         HV * const table = GvHV(PL_hintgv);
5495         SV **ptr;
5496
5497         if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5498             return &PL_core_reg_engine;
5499         ptr = hv_fetchs(table, "regcomp", FALSE);
5500         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5501             return &PL_core_reg_engine;
5502         return INT2PTR(regexp_engine*,SvIV(*ptr));
5503     }
5504     else {
             /* Otherwise: use the hints attached to the current cop. */
5505         SV *ptr;
5506         if (!PL_curcop->cop_hints_hash)
5507             return &PL_core_reg_engine;
5508         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5509         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5510             return &PL_core_reg_engine;
5511         return INT2PTR(regexp_engine*,SvIV(ptr));
5512     }
5513 }
5514
5515
5516 REGEXP *
5517 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5518 {
         /* Compile 'pattern' with whichever regex engine current_re_engine()
          * reports as in scope, passing 'flags' through unchanged, and return
          * that engine's result. */
5519     dVAR;
5520     regexp_engine const *eng = current_re_engine();
5521     GET_RE_DEBUG_FLAGS_DECL;
5522
5523     PERL_ARGS_ASSERT_PREGCOMP;
5524
5525     /* Dispatch a request to compile a regexp to correct regexp engine. */
         /* Under regex compile debugging, report the address of the engine
          * structure that was chosen. */
5526     DEBUG_COMPILE_r({
5527         PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5528                         PTR2UV(eng));
5529     });
5530     return CALLREGCOMP_ENG(eng, pattern, flags);
5531 }
5532 #endif
5533
5534 /* public(ish) entry point for the perl core's own regex compiling code.
5535  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5536  * pattern rather than a list of OPs, and uses the internal engine rather
5537  * than the current one */
5538
5539 REGEXP *
5540 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5541 {
         /* Perl_re_op_compile() takes an array of pattern SVs plus a count,
          * so hand it the address of a local copy of the pointer and a count
          * of 1. */
5542     SV *pat = pattern; /* defeat constness! */
5543     PERL_ARGS_ASSERT_RE_COMPILE;
         /* The engine is fixed at build time: my_reg_engine when this file is
          * built with PERL_IN_XSUB_RE, PL_core_reg_engine otherwise.  The
          * remaining NULL/0 arguments are presumably "no op tree, no old
          * regex, no runtime-code flag, no pm_flags" -- confirm against the
          * Perl_re_op_compile() prototype. */
5544     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5545 #ifdef PERL_IN_XSUB_RE
5546                                 &my_reg_engine,
5547 #else
5548                                 &PL_core_reg_engine,
5549 #endif
5550                                 NULL, NULL, rx_flags, 0);
5551 }
5552
5553
5554 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5555  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5556  * point to the realloced string and length.
5557  *
5558  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5559  * stuff added */
5560
5561 static void
5562 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5563                     char **pat_p, STRLEN *plen_p, int num_code_blocks)
5564 {
5565     U8 *const src = (U8*)*pat_p;
5566     U8 *dst;
5567     int n=0;                /* index of the code block currently tracked */
5568     STRLEN s = 0, d = 0;    /* read offset in src, write offset in dst */
5569     bool do_end = 0;        /* set once block n's start has been remapped
                                * and its end is what we are looking for */
5570     GET_RE_DEBUG_FLAGS_DECL;
5571
5572     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5573         "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5574
         /* Each source byte becomes at most two bytes here, plus one for the
          * trailing NUL. */
5575     Newx(dst, *plen_p * 2 + 1, U8);
5576
5577     while (s < *plen_p) {
5578         if (NATIVE_BYTE_IS_INVARIANT(src[s]))
5579             dst[d]   = src[s];
5580         else {
5581             dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
5582             dst[d]   = UTF8_EIGHT_BIT_LO(src[s]);
5583         }
             /* At this point d indexes the last byte written for src[s].  If
              * s is the start (or, once the start has been seen, the end) of
              * code block n, rewrite that offset from source to destination
              * coordinates.  Only the first num_code_blocks entries are
              * touched. */
5584         if (n < num_code_blocks) {
5585             if (!do_end && pRExC_state->code_blocks[n].start == s) {
5586                 pRExC_state->code_blocks[n].start = d;
5587                 assert(dst[d] == '(');
5588                 do_end = 1;
5589             }
5590             else if (do_end && pRExC_state->code_blocks[n].end == s) {
5591                 pRExC_state->code_blocks[n].end = d;
5592                 assert(dst[d] == ')');
5593                 do_end = 0;
5594                 n++;
5595             }
5596         }
5597         s++;
5598         d++;
5599     }
5600     dst[d] = '\0';
5601     *plen_p = d;
5602     *pat_p = (char*) dst;
         /* Register the new buffer with SAVEFREEPV so the caller does not
          * have to free it; the buffer originally passed in is not freed
          * here. */
5603     SAVEFREEPV(*pat_p);
5604     RExC_orig_utf8 = RExC_utf8 = 1;
5605 }
5606
5607
5608
5609 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5610  * while recording any code block indices, and handling overloading,
5611  * nested qr// objects etc.  If pat is null, it will allocate a new
5612  * string, or just return the first arg, if there's only one.
5613  *
5614  * Returns the malloced/updated pat.
5615  * patternp and pat_count is the array of SVs to be concatted;
5616  * oplist is the optional list of ops that generated the SVs;
5617  * recompile_p is a pointer to a boolean that will be set if
5618  *   the regex will need to be recompiled.
5619  * delim, if non-null is an SV that will be inserted between each element
5620  */
5621
5622 static SV*
5623 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5624                 SV *pat, SV ** const patternp, int pat_count,
5625                 OP *oplist, bool *recompile_p, SV *delim)
5626 {
5627     SV **svp;
5628     int n = 0;              /* next pRExC_state->code_blocks[] slot to fill */
5629     bool use_delim = FALSE; /* TRUE when this iteration appends delim */
5630     bool alloced = FALSE;   /* TRUE if pat was created in this call */
5631
5632     /* if we know we have at least two args, create an empty string,
5633      * then concatenate args to that. For no args, return an empty string */
5634     if (!pat && pat_count != 1) {
5635         pat = newSVpvn("", 0);
5636         SAVEFREESV(pat);
5637         alloced = TRUE;
5638     }
5639
5640     for (svp = patternp; svp < patternp + pat_count; svp++) {
5641         SV *sv;
5642         SV *rx  = NULL;
5643         STRLEN orig_patlen = 0;
5644         bool code = 0;
5645         SV *msv = use_delim ? delim : *svp;
5646         if (!msv) msv = &PL_sv_undef;
5647
5648         /* if we've got a delimiter, we go round the loop twice for each
5649          * svp slot (except the last), using the delimiter the second
5650          * time round */
5651         if (use_delim) {
                 /* cancel the loop's svp++ so the next pass sees the element
                  * that follows the one just joined */
5652             svp--;
5653             use_delim = FALSE;
5654         }
5655         else if (delim)
5656             use_delim = TRUE;
5657
5658         if (SvTYPE(msv) == SVt_PVAV) {
5659             /* we've encountered an interpolated array within
5660              * the pattern, e.g. /...@a..../. Expand the list of elements,
5661              * then recursively append elements.
5662              * The code in this block is based on S_pushav() */
5663
5664             AV *const av = (AV*)msv;
5665             const SSize_t maxarg = AvFILL(av) + 1;
5666             SV **array;
5667
5668             if (oplist) {
5669                 assert(oplist->op_type == OP_PADAV
5670                     || oplist->op_type == OP_RV2AV);
5671                 oplist = oplist->op_sibling;;
5672             }
5673
                 /* For an array with magic, fetch every element through
                  * av_fetch() into a temporary C array (freed later via
                  * SAVEFREEPV); otherwise use the AV's own storage. */
5674             if (SvRMAGICAL(av)) {
5675                 SSize_t i;
5676
5677                 Newx(array, maxarg, SV*);
5678                 SAVEFREEPV(array);
5679                 for (i=0; i < maxarg; i++) {
5680                     SV ** const svp = av_fetch(av, i, FALSE);
5681                     array[i] = svp ? *svp : &PL_sv_undef;
5682                 }
5683             }
5684             else
5685                 array = AvARRAY(av);
5686
                 /* Recurse over the elements with no op list, joining them
                  * with the current value of $" */
5687             pat = S_concat_pat(aTHX_ pRExC_state, pat,
5688                                 array, maxarg, NULL, recompile_p,
5689                                 /* $" */
5690                                 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5691
5692             continue;
5693         }
5694
5695
5696         /* we make the assumption here that each op in the list of
5697          * op_siblings maps to one SV pushed onto the stack,
5698          * except for code blocks, which have both an OP_NULL and
5699          * an OP_CONST.
5700          * This allows us to match up the list of SVs against the
5701          * list of OPs to find the next code block.
5702          *
5703          * Note that       PUSHMARK PADSV PADSV ..
5704          * is optimised to
5705          *                 PADRANGE PADSV  PADSV  ..
5706          * so the alignment still works. */
5707
5708         if (oplist) {
5709             if (oplist->op_type == OP_NULL
5710                 && (oplist->op_flags & OPf_SPECIAL))
5711             {
                     /* A literal code block: it will start at the current end
                      * of pat; its end offset is filled in below, once the
                      * text has been appended. */
5712                 assert(n < pRExC_state->num_code_blocks);
5713                 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5714                 pRExC_state->code_blocks[n].block = oplist;
5715                 pRExC_state->code_blocks[n].src_regex = NULL;
5716                 n++;
5717                 code = 1;
5718                 oplist = oplist->op_sibling; /* skip CONST */
5719                 assert(oplist);
5720             }
5721             oplist = oplist->op_sibling;;
5722         }
5723
5724         /* apply magic and QR overloading to arg */
5725
5726         SvGETMAGIC(msv);
5727         if (SvROK(msv) && SvAMAGIC(msv)) {
5728             SV *sv = AMG_CALLunary(msv, regexp_amg);
5729             if (sv) {
5730                 if (SvROK(sv))
5731                     sv = SvRV(sv);
5732                 if (SvTYPE(sv) != SVt_REGEXP)
5733                     Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5734                 msv = sv;
5735             }
5736         }
5737
5738         /* try concatenation overload ... */
5739         if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5740                 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5741         {
5742             sv_setsv(pat, sv);
5743             /* overloading involved: all bets are off over literal
5744              * code. Pretend we haven't seen it */
5745             pRExC_state->num_code_blocks -= n;
5746             n = 0;
5747         }
5748         else  {
5749             /* ... or failing that, try "" overload */
                 /* keep stringifying until the overload stops yielding a
                  * different value (same SV, or a ref to the same referent) */
5750             while (SvAMAGIC(msv)
5751                     && (sv = AMG_CALLunary(msv, string_amg))
5752                     && sv != msv
5753                     &&  !(   SvROK(msv)
5754                           && SvROK(sv)
5755                           && SvRV(msv) == SvRV(sv))
5756             ) {
5757                 msv = sv;
5758                 SvGETMAGIC(msv);
5759             }
5760             if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5761                 msv = SvRV(msv);
5762
5763             if (pat) {
5764                 /* this is a partially unrolled
5765                  *     sv_catsv_nomg(pat, msv);
5766                  * that allows us to adjust code block indices if
5767                  * needed */
5768                 STRLEN dlen;
5769                 char *dst = SvPV_force_nomg(pat, dlen);
5770                 orig_patlen = dlen;
5771                 if (SvUTF8(msv) && !SvUTF8(pat)) {
5772                     S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5773                     sv_setpvn(pat, dst, dlen);
5774                     SvUTF8_on(pat);
5775                 }
5776                 sv_catsv_nomg(pat, msv);
                     /* remember what was appended so that code blocks of an
                      * embedded qr// can be picked up below */
5777                 rx = msv;
5778             }
5779             else
                     /* single-argument case: the argument itself becomes the
                      * result; rx stays NULL on this path */
5780                 pat = msv;
5781
5782             if (code)
5783                 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5784         }
5785
5786         /* extract any code blocks within any embedded qr//'s */
5787         if (rx && SvTYPE(rx) == SVt_REGEXP
5788             && RX_ENGINE((REGEXP*)rx)->op_comp)
5789         {
5790
5791             RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5792             if (ri->num_code_blocks) {
5793                 int i;
5794                 /* the presence of an embedded qr// with code means
5795                  * we should always recompile: the text of the
5796                  * qr// may not have changed, but it may be a
5797                  * different closure than last time */
5798                 *recompile_p = 1;
5799                 Renew(pRExC_state->code_blocks,
5800                     pRExC_state->num_code_blocks + ri->num_code_blocks,
5801                     struct reg_code_block);
5802                 pRExC_state->num_code_blocks += ri->num_code_blocks;
5803
5804                 for (i=0; i < ri->num_code_blocks; i++) {
5805                     struct reg_code_block *src, *dst;
                         /* shift the qr's own offsets by the length pat had
                          * before the append, plus the qr's pre_prefix */
5806                     STRLEN offset =  orig_patlen
5807                         + ReANY((REGEXP *)rx)->pre_prefix;
5808                     assert(n < pRExC_state->num_code_blocks);
5809                     src = &ri->code_blocks[i];
5810                     dst = &pRExC_state->code_blocks[n];
5811                     dst->start      = src->start + offset;
5812                     dst->end        = src->end   + offset;
5813                     dst->block      = src->block;
                         /* hold a reference on the regex the block came from:
                          * the block's recorded source regex if it has one,
                          * otherwise rx itself */
5814                     dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
5815                                             src->src_regex
5816                                                 ? src->src_regex
5817                                                 : (REGEXP*)rx);
5818                     n++;
5819                 }
5820             }
5821         }
5822     }
5823     /* avoid calling magic multiple times on a single element e.g. =~ $qr */
5824     if (alloced)
5825         SvSETMAGIC(pat);
5826
5827     return pat;
5828 }
5829
5830
5831
5832 /* see if there are any run-time code blocks in the pattern.
5833  * False positives are allowed */
5834
5835 static bool
5836 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5837                     char *pat, STRLEN plen)
5838 {
         /* Scan the plen bytes at pat for text that looks like the start of
          * a code block, "(?{" or "(??{", outside the spans already recorded
          * in pRExC_state->code_blocks[] (the known literal blocks).  Returns
          * 1 as soon as one is found, 0 otherwise.  Nothing at or beyond
          * pat[plen] is read, so pat need not be NUL-terminated. */
5839     int n = 0;              /* next known code block to skip over */
5840     STRLEN s;
5841
5842     for (s = 0; s < plen; s++) {
5843         if (n < pRExC_state->num_code_blocks
5844             && s == pRExC_state->code_blocks[n].start)
5845         {
                 /* jump to the block's last char; the loop's s++ then moves
                  * past it */
5846             s = pRExC_state->code_blocks[n].end;
5847             n++;
5848             continue;
5849         }
5850         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5851          * positives here */
             /* "(?{" needs pat[s+2] in range; "(??{" additionally needs
              * pat[s+3] in range */
5852         if (pat[s] == '(' && s+2 < plen && pat[s+1] == '?' &&
5853             (pat[s+2] == '{'
5854                 || (s + 3 < plen && pat[s+2] == '?' && pat[s+3] == '{'))
5855         )
5856             return 1;
5857     }
5858     return 0;
5859 }
5860
5861 /* Handle run-time code blocks. We will already have compiled any direct
5862  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5863  * copy of it, but with any literal code blocks blanked out and
5864  * appropriate chars escaped; then feed it into
5865  *
5866  *    eval "qr'modified_pattern'"
5867  *
5868  * For example,
5869  *
5870  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5871  *
5872  * becomes
5873  *
5874  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5875  *
5876  * After eval_sv()-ing that, grab any new code blocks from the returned qr
5877  * and merge them with any code blocks of the original regexp.
5878  *
5879  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5880  * instead, just save the qr and return FALSE; this tells our caller that
5881  * the original pattern needs upgrading to utf8.
5882  */
5883
5884 static bool
5885 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5886     char *pat, STRLEN plen)
5887 {
5888     SV *qr;
5889
5890     GET_RE_DEBUG_FLAGS_DECL;
5891
5892     if (pRExC_state->runtime_code_qr) {
5893         /* this is the second time we've been called; this should
5894          * only happen if the main pattern got upgraded to utf8
5895          * during compilation; re-use the qr we compiled first time
5896          * round (which should be utf8 too)
5897          */
5898         qr = pRExC_state->runtime_code_qr;
5899         pRExC_state->runtime_code_qr = NULL;
5900         assert(RExC_utf8 && SvUTF8(qr));
5901     }
5902     else {
5903         int n = 0;
5904         STRLEN s;
5905         char *p, *newpat;
             /* Sized as STRLEN, like plen: an int would be truncated for
              * patterns longer than INT_MAX, giving an undersized buffer. */
5906         STRLEN newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5907         SV *sv, *qr_ref;
5908         dSP;
5909
5910         /* determine how many extra chars we need for ' and \ escaping */
5911         for (s = 0; s < plen; s++) {
5912             if (pat[s] == '\'' || pat[s] == '\\')
5913                 newlen++;
5914         }
5915
5916         Newx(newpat, newlen, char);
5917         p = newpat;
5918         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5919
5920         for (s = 0; s < plen; s++) {
5921             if (n < pRExC_state->num_code_blocks
5922                 && s == pRExC_state->code_blocks[n].start)
5923             {
5924                 /* blank out literal code block */
5925                 assert(pat[s] == '(');
5926                 while (s <= pRExC_state->code_blocks[n].end) {
5927                     *p++ = '_';
5928                     s++;
5929                 }
                     /* step back onto the block's last char; the for loop's
                      * s++ then moves past it */
5930                 s--;
5931                 n++;
5932                 continue;
5933             }
5934             if (pat[s] == '\'' || pat[s] == '\\')
5935                 *p++ = '\\';
5936             *p++ = pat[s];
5937         }
5938         *p++ = '\'';
5939         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5940             *p++ = 'x';
5941         *p++ = '\0';
5942         DEBUG_COMPILE_r({
5943             PerlIO_printf(Perl_debug_log,
5944                 "%sre-parsing pattern for runtime code:%s %s\n",
5945                 PL_colors[4],PL_colors[5],newpat);
5946         });
5947
             /* the SV's length excludes the trailing NUL written above */
5948         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5949         Safefree(newpat);
5950
5951         ENTER;
5952         SAVETMPS;
5953         save_re_context();
5954         PUSHSTACKi(PERLSI_REQUIRE);
5955         /* G_RE_REPARSING causes the toker to collapse \\ into \ when
5956          * parsing qr''; normally only q'' does this. It also alters
5957          * hints handling */
5958         eval_sv(sv, G_SCALAR|G_RE_REPARSING);
5959         SvREFCNT_dec_NN(sv);
5960         SPAGAIN;
5961         qr_ref = POPs;
5962         PUTBACK;
5963         {
5964             SV * const errsv = ERRSV;
5965             if (SvTRUE_NN(errsv))
5966             {
                     /* the eval failed: release the code block array and
                      * re-throw the eval's error */
5967                 Safefree(pRExC_state->code_blocks);
5968                 /* use croak_sv ? */
5969                 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
5970             }
5971         }
5972         assert(SvROK(qr_ref));
5973         qr = SvRV(qr_ref);
5974         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5975         /* the leaving below frees the tmp qr_ref.
5976          * Give qr a life of its own */
5977         SvREFCNT_inc(qr);
5978         POPSTACK;
5979         FREETMPS;
5980         LEAVE;
5981
5982     }
5983
5984     if (!RExC_utf8 && SvUTF8(qr)) {
5985         /* first time through; the pattern got upgraded; save the
5986          * qr for the next time through */
5987         assert(!pRExC_state->runtime_code_qr);
5988         pRExC_state->runtime_code_qr = qr;
5989         return 0;
5990     }
5991
5992
5993     /* extract any code blocks within the returned qr//  */
5994
5995
5996     /* merge the main (r1) and run-time (r2) code blocks into one */
5997     {
5998         RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
5999         struct reg_code_block *new_block, *dst;
6000         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6001         int i1 = 0, i2 = 0;
6002
6003         if (!r2->num_code_blocks) /* we guessed wrong */
6004         {
6005             SvREFCNT_dec_NN(qr);
6006             return 1;
6007         }
6008
6009         Newx(new_block,
6010             r1->num_code_blocks + r2->num_code_blocks,
6011             struct reg_code_block);
6012         dst = new_block;
6013
             /* Two-way merge by start offset.  The asserts below require that
              * blocks from the two lists never share a start and never
              * overlap. */
6014         while (    i1 < r1->num_code_blocks
6015                 || i2 < r2->num_code_blocks)
6016         {
6017             struct reg_code_block *src;
6018             bool is_qr = 0;     /* set when src comes from the evalled qr */
6019
6020             if (i1 == r1->num_code_blocks) {
6021                 src = &r2->code_blocks[i2++];
6022                 is_qr = 1;
6023             }
6024             else if (i2 == r2->num_code_blocks)
6025                 src = &r1->code_blocks[i1++];
6026             else if (  r1->code_blocks[i1].start
6027                      < r2->code_blocks[i2].start)
6028             {
6029                 src = &r1->code_blocks[i1++];
6030                 assert(src->end < r2->code_blocks[i2].start);
6031             }
6032             else {
6033                 assert(  r1->code_blocks[i1].start
6034                        > r2->code_blocks[i2].start);
6035                 src = &r2->code_blocks[i2++];
6036                 is_qr = 1;
6037                 assert(src->end < r1->code_blocks[i1].start);
6038             }
6039
6040             assert(pat[src->start] == '(');
6041             assert(pat[src->end]   == ')');
6042             dst->start      = src->start;
6043             dst->end        = src->end;
6044             dst->block      = src->block;
                 /* blocks taken from the evalled qr get their own reference
                  * to it; blocks from the main list keep their src_regex */
6045             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6046                                     : src->src_regex;
6047             dst++;
6048         }
6049         r1->num_code_blocks += r2->num_code_blocks;
6050         Safefree(r1->code_blocks);
6051         r1->code_blocks = new_block;
6052     }
6053
         /* drop the reference this function held on qr */
6054     SvREFCNT_dec_NN(qr);
6055     return 1;
6056 }
6057
6058
6059 STATIC bool
6060 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6061                       SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6062                       SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6063                       STRLEN longest_length, bool eol, bool meol)
6064 {
6065     /* This is the common code for setting up the floating and fixed length
6066      * string data extracted from Perl_re_op_compile() below.  Returns a boolean
6067      * as to whether succeeded or not */
6068
6069     I32 t;
6070     SSize_t ml;
6071
6072     if (! (longest_length
6073            || (eol /* Can't have SEOL and MULTI */
6074                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6075           )
6076             /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6077         || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6078     {
6079         return FALSE;
6080     }
6081
6082     /* copy the information about the longest from the reg_scan_data
6083         over to the program. */
6084     if (SvUTF8(sv_longest)) {
6085         *rx_utf8 = sv_longest;
6086         *rx_substr = NULL;
6087     } else {
6088         *rx_substr = sv_longest;
6089         *rx_utf8 = NULL;
6090     }
6091     /* end_shift is how many chars that must be matched that
6092         follow this item. We calculate it ahead of time as once the
6093         lookbehind offset is added in we lose the ability to correctly
6094         calculate it.*/
6095     ml = minlen ? *(minlen) : (SSize_t)longest_length;
6096     *rx_end_shift = ml - offset
6097         - longest_length + (SvTAIL(sv_longest) != 0)
6098         + lookbehind;
6099
6100     t = (eol/* Can't have SEOL and MULTI */
6101          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6102     fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6103
6104     return TRUE;
6105 }
6106
6107 /*
6108  * Perl_re_op_compile - the perl internal RE engine's function to compile a
6109  * regular expression into internal code.
6110  * The pattern may be passed either as:
6111  *    a list of SVs (patternp plus pat_count)
6112  *    a list of OPs (expr)
6113  * If both are passed, the SV list is used, but the OP list indicates
6114  * which SVs are actually pre-compiled code blocks
6115  *
6116  * The SVs in the list have magic and qr overloading applied to them (and
6117  * the list may be modified in-place with replacement SVs in the latter
6118  * case).
6119  *
6120  * If the pattern hasn't changed from old_re, then old_re will be
6121  * returned.
6122  *
6123  * eng is the current engine. If that engine has an op_comp method, then
6124  * handle directly (i.e. we assume that op_comp was us); otherwise, just
6125  * do the initial concatenation of arguments and pass on to the external
6126  * engine.
6127  *
6128  * If is_bare_re is not null, set it to a boolean indicating whether the
6129  * arg list reduced (after overloading) to a single bare regex which has
6130  * been returned (i.e. /$qr/).
6131  *
6132  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6133  *
6134  * pm_flags contains the PMf_* flags, typically based on those from the
6135  * pm_flags field of the related PMOP. Currently we're only interested in
6136  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6137  *
6138  * We can't allocate space until we know how big the compiled form will be,
6139  * but we can't compile it (and thus know how big it is) until we've got a
6140  * place to put the code.  So we cheat:  we compile it twice, once with code
6141  * generation turned off and size counting turned on, and once "for real".
6142  * This also means that we don't allocate space until we are sure that the
6143  * thing really will compile successfully, and we never have to move the
6144  * code and thus invalidate pointers into it.  (Note that it has to be in
6145  * one piece because free() must be able to free it all.) [NB: not true in perl]
6146  *
6147  * Beware that the optimization-preparation code in here knows about some
6148  * of the structure of the compiled regexp.  [I'll say.]
6149  */
6150
6151 REGEXP *
6152 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6153                     OP *expr, const regexp_engine* eng, REGEXP *old_re,
6154                      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6155 {
6156     dVAR;
6157     REGEXP *rx;
6158     struct regexp *r;
6159     regexp_internal *ri;
6160     STRLEN plen;
6161     char *exp;
6162     regnode *scan;
6163     I32 flags;
6164     SSize_t minlen = 0;
6165     U32 rx_flags;
6166     SV *pat;
6167     SV *code_blocksv = NULL;
6168     SV** new_patternp = patternp;
6169
6170     /* these are all flags - maybe they should be turned
6171      * into a single int with different bit masks */
6172     I32 sawlookahead = 0;
6173     I32 sawplus = 0;
6174     I32 sawopen = 0;
6175     I32 sawminmod = 0;
6176
6177     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6178     bool recompile = 0;
6179     bool runtime_code = 0;
6180     scan_data_t data;
6181     RExC_state_t RExC_state;
6182     RExC_state_t * const pRExC_state = &RExC_state;
6183 #ifdef TRIE_STUDY_OPT
6184     int restudied = 0;
6185     RExC_state_t copyRExC_state;
6186 #endif
6187     GET_RE_DEBUG_FLAGS_DECL;
6188
6189     PERL_ARGS_ASSERT_RE_OP_COMPILE;
6190
6191     DEBUG_r(if (!PL_colorset) reginitcolors());
6192
6193 #ifndef PERL_IN_XSUB_RE
6194     /* Initialize these here instead of as-needed, as is quick and avoids
6195      * having to test them each time otherwise */
6196     if (! PL_AboveLatin1) {
6197         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6198         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6199         PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6200         PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6201         PL_HasMultiCharFold =
6202                        _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6203     }
6204 #endif
6205
6206     pRExC_state->code_blocks = NULL;
6207     pRExC_state->num_code_blocks = 0;
6208
6209     if (is_bare_re)
6210         *is_bare_re = FALSE;
6211
6212     if (expr && (expr->op_type == OP_LIST ||
6213                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6214         /* allocate code_blocks if needed */
6215         OP *o;
6216         int ncode = 0;
6217
6218         for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
6219             if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6220                 ncode++; /* count of DO blocks */
6221         if (ncode) {
6222             pRExC_state->num_code_blocks = ncode;
6223             Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6224         }
6225     }
6226
6227     if (!pat_count) {
6228         /* compile-time pattern with just OP_CONSTs and DO blocks */
6229
6230         int n;
6231         OP *o;
6232
6233         /* find how many CONSTs there are */
6234         assert(expr);
6235         n = 0;
6236         if (expr->op_type == OP_CONST)
6237             n = 1;
6238         else
6239             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6240                 if (o->op_type == OP_CONST)
6241                     n++;
6242             }
6243
6244         /* fake up an SV array */
6245
6246         assert(!new_patternp);
6247         Newx(new_patternp, n, SV*);
6248         SAVEFREEPV(new_patternp);
6249         pat_count = n;
6250
6251         n = 0;
6252         if (expr->op_type == OP_CONST)
6253             new_patternp[n] = cSVOPx_sv(expr);
6254         else
6255             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6256                 if (o->op_type == OP_CONST)
6257                     new_patternp[n++] = cSVOPo_sv;
6258             }
6259
6260     }
6261
6262     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6263         "Assembling pattern from %d elements%s\n", pat_count,
6264             orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6265
6266     /* set expr to the first arg op */
6267
6268     if (pRExC_state->num_code_blocks
6269          && expr->op_type != OP_CONST)
6270     {
6271             expr = cLISTOPx(expr)->op_first;
6272             assert(   expr->op_type == OP_PUSHMARK
6273                    || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6274                    || expr->op_type == OP_PADRANGE);
6275             expr = expr->op_sibling;
6276     }
6277
6278     pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6279                         expr, &recompile, NULL);
6280
6281     /* handle bare (possibly after overloading) regex: foo =~ $re */
6282     {
6283         SV *re = pat;
6284         if (SvROK(re))
6285             re = SvRV(re);
6286         if (SvTYPE(re) == SVt_REGEXP) {
6287             if (is_bare_re)
6288                 *is_bare_re = TRUE;
6289             SvREFCNT_inc(re);
6290             Safefree(pRExC_state->code_blocks);
6291             DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6292                 "Precompiled pattern%s\n",
6293                     orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6294
6295             return (REGEXP*)re;
6296         }
6297     }
6298
6299     exp = SvPV_nomg(pat, plen);
6300
6301     if (!eng->op_comp) {
6302         if ((SvUTF8(pat) && IN_BYTES)
6303                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6304         {
6305             /* make a temporary copy; either to convert to bytes,
6306              * or to avoid repeating get-magic / overloaded stringify */
6307             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6308                                         (IN_BYTES ? 0 : SvUTF8(pat)));
6309         }
6310         Safefree(pRExC_state->code_blocks);
6311         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6312     }
6313
6314     /* ignore the utf8ness if the pattern is 0 length */
6315     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6316     RExC_uni_semantics = 0;
6317     RExC_contains_locale = 0;
6318     RExC_contains_i = 0;
6319     pRExC_state->runtime_code_qr = NULL;
6320
6321     DEBUG_COMPILE_r({
6322             SV *dsv= sv_newmortal();
6323             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6324             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6325                           PL_colors[4],PL_colors[5],s);
6326         });
6327
6328   redo_first_pass:
6329     /* we jump here if we upgrade the pattern to utf8 and have to
6330      * recompile */
6331
6332     if ((pm_flags & PMf_USE_RE_EVAL)
6333                 /* this second condition covers the non-regex literal case,
6334                  * i.e.  $foo =~ '(?{})'. */
6335                 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6336     )
6337         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6338
6339     /* return old regex if pattern hasn't changed */
6340     /* XXX: note in the below we have to check the flags as well as the
6341      * pattern.
6342      *
6343      * Things get a touch tricky as we have to compare the utf8 flag
6344      * independently from the compile flags.  */
6345
6346     if (   old_re
6347         && !recompile
6348         && !!RX_UTF8(old_re) == !!RExC_utf8
6349         && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6350         && RX_PRECOMP(old_re)
6351         && RX_PRELEN(old_re) == plen
6352         && memEQ(RX_PRECOMP(old_re), exp, plen)
6353         && !runtime_code /* with runtime code, always recompile */ )
6354     {
6355         Safefree(pRExC_state->code_blocks);
6356         return old_re;
6357     }
6358
6359     rx_flags = orig_rx_flags;
6360
6361     if (rx_flags & PMf_FOLD) {
6362         RExC_contains_i = 1;
6363     }
6364     if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6365
6366         /* Set to use unicode semantics if the pattern is in utf8 and has the
6367          * 'depends' charset specified, as it means unicode when utf8  */
6368         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6369     }
6370
6371     RExC_precomp = exp;
6372     RExC_flags = rx_flags;
6373     RExC_pm_flags = pm_flags;
6374
6375     if (runtime_code) {
6376         if (TAINTING_get && TAINT_get)
6377             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6378
6379         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6380             /* whoops, we have a non-utf8 pattern, whilst run-time code
6381              * got compiled as utf8. Try again with a utf8 pattern */
6382             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6383                                     pRExC_state->num_code_blocks);
6384             goto redo_first_pass;
6385         }
6386     }
6387     assert(!pRExC_state->runtime_code_qr);
6388
6389     RExC_sawback = 0;
6390
6391     RExC_seen = 0;
6392     RExC_maxlen = 0;
6393     RExC_in_lookbehind = 0;
6394     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6395     RExC_extralen = 0;
6396     RExC_override_recoding = 0;
6397     RExC_in_multi_char_class = 0;
6398
6399     /* First pass: determine size, legality. */
6400     RExC_parse = exp;
6401     RExC_start = exp;
6402     RExC_end = exp + plen;
6403     RExC_naughty = 0;
6404     RExC_npar = 1;
6405     RExC_nestroot = 0;
6406     RExC_size = 0L;
6407     RExC_emit = (regnode *) &RExC_emit_dummy;
6408     RExC_whilem_seen = 0;
6409     RExC_open_parens = NULL;
6410     RExC_close_parens = NULL;
6411     RExC_opend = NULL;
6412     RExC_paren_names = NULL;
6413 #ifdef DEBUGGING
6414     RExC_paren_name_list = NULL;
6415 #endif
6416     RExC_recurse = NULL;
6417     RExC_study_chunk_recursed = NULL;
6418     RExC_study_chunk_recursed_bytes= 0;
6419     RExC_recurse_count = 0;
6420     pRExC_state->code_index = 0;
6421
6422 #if 0 /* REGC() is (currently) a NOP at the first pass.
6423        * Clever compilers notice this and complain. --jhi */
6424     REGC((U8)REG_MAGIC, (char*)RExC_emit);
6425 #endif
6426     DEBUG_PARSE_r(
6427         PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6428         RExC_lastnum=0;
6429         RExC_lastparse=NULL;
6430     );
6431     /* reg may croak on us, not giving us a chance to free
6432        pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
6433        need it to survive as long as the regexp (qr/(?{})/).
6434        We must check that code_blocksv is not already set, because we may
6435        have jumped back to restart the sizing pass. */
6436     if (pRExC_state->code_blocks && !code_blocksv) {
6437         code_blocksv = newSV_type(SVt_PV);
6438         SAVEFREESV(code_blocksv);
6439         SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6440         SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6441     }
6442     if (reg(pRExC_state, 0, &flags,1) == NULL) {
6443         /* It's possible to write a regexp in ascii that represents Unicode
6444         codepoints outside of the byte range, such as via \x{100}. If we
6445         detect such a sequence we have to convert the entire pattern to utf8
6446         and then recompile, as our sizing calculation will have been based
6447         on 1 byte == 1 character, but we will need to use utf8 to encode
6448         at least some part of the pattern, and therefore must convert the whole
6449         thing.
6450         -- dmq */
6451         if (flags & RESTART_UTF8) {
6452             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6453                                     pRExC_state->num_code_blocks);
6454             goto redo_first_pass;
6455         }
6456         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6457     }
6458     if (code_blocksv)
6459         SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6460
6461     DEBUG_PARSE_r({
6462         PerlIO_printf(Perl_debug_log,
6463             "Required size %"IVdf" nodes\n"
6464             "Starting second pass (creation)\n",
6465             (IV)RExC_size);
6466         RExC_lastnum=0;
6467         RExC_lastparse=NULL;
6468     });
6469
6470     /* The first pass could have found things that force Unicode semantics */
6471     if ((RExC_utf8 || RExC_uni_semantics)
6472          && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6473     {
6474         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6475     }
6476
6477     /* Small enough for pointer-storage convention?
6478        If extralen==0, this means that we will not need long jumps. */
6479     if (RExC_size >= 0x10000L && RExC_extralen)
6480         RExC_size += RExC_extralen;
6481     else
6482         RExC_extralen = 0;
6483     if (RExC_whilem_seen > 15)
6484         RExC_whilem_seen = 15;
6485
6486     /* Allocate space and zero-initialize. Note, the two step process
6487        of zeroing when in debug mode, thus anything assigned has to
6488        happen after that */
6489     rx = (REGEXP*) newSV_type(SVt_REGEXP);
6490     r = ReANY(rx);
6491     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6492          char, regexp_internal);
6493     if ( r == NULL || ri == NULL )
6494         FAIL("Regexp out of space");
6495 #ifdef DEBUGGING
6496     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6497     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6498          char);
6499 #else
6500     /* bulk initialize base fields with 0. */
6501     Zero(ri, sizeof(regexp_internal), char);
6502 #endif
6503
6504     /* non-zero initialization begins here */
6505     RXi_SET( r, ri );
6506     r->engine= eng;
6507     r->extflags = rx_flags;
6508     RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6509
6510     if (pm_flags & PMf_IS_QR) {
6511         ri->code_blocks = pRExC_state->code_blocks;
6512         ri->num_code_blocks = pRExC_state->num_code_blocks;
6513     }
6514     else
6515     {
6516         int n;
6517         for (n = 0; n < pRExC_state->num_code_blocks; n++)
6518             if (pRExC_state->code_blocks[n].src_regex)
6519                 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6520         SAVEFREEPV(pRExC_state->code_blocks);
6521     }
6522
6523     {
6524         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6525         bool has_charset = (get_regex_charset(r->extflags)
6526                                                     != REGEX_DEPENDS_CHARSET);
6527
6528         /* The caret is output if there are any defaults: if not all the STD
6529          * flags are set, or if no character set specifier is needed */
6530         bool has_default =
6531                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6532                     || ! has_charset);
6533         bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
6534                                                    == REG_RUN_ON_COMMENT_SEEN);
6535         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6536                             >> RXf_PMf_STD_PMMOD_SHIFT);
6537         const char *fptr = STD_PAT_MODS;        /*"msix"*/
6538         char *p;
6539         /* Allocate for the worst case, which is all the std flags are turned
6540          * on.  If more precision is desired, we could do a population count of
6541          * the flags set.  This could be done with a small lookup table, or by
6542          * shifting, masking and adding, or even, when available, assembly
6543          * language for a machine-language population count.
6544          * We never output a minus, as all those are defaults, so are
6545          * covered by the caret */
6546         const STRLEN wraplen = plen + has_p + has_runon
6547             + has_default       /* If needs a caret */
6548
6549                 /* If needs a character set specifier */
6550             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6551             + (sizeof(STD_PAT_MODS) - 1)
6552             + (sizeof("(?:)") - 1);
6553
6554         Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6555         r->xpv_len_u.xpvlenu_pv = p;
6556         if (RExC_utf8)
6557             SvFLAGS(rx) |= SVf_UTF8;
6558         *p++='('; *p++='?';
6559
6560         /* If a default, cover it using the caret */
6561         if (has_default) {
6562             *p++= DEFAULT_PAT_MOD;
6563         }
6564         if (has_charset) {
6565             STRLEN len;
6566             const char* const name = get_regex_charset_name(r->extflags, &len);
6567             Copy(name, p, len, char);
6568             p += len;
6569         }
6570         if (has_p)
6571             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6572         {
6573             char ch;
6574             while((ch = *fptr++)) {
6575                 if(reganch & 1)
6576                     *p++ = ch;
6577                 reganch >>= 1;
6578             }
6579         }
6580
6581         *p++ = ':';
6582         Copy(RExC_precomp, p, plen, char);
6583         assert ((RX_WRAPPED(rx) - p) < 16);
6584         r->pre_prefix = p - RX_WRAPPED(rx);
6585         p += plen;
6586         if (has_runon)
6587             *p++ = '\n';
6588         *p++ = ')';
6589         *p = 0;
6590         SvCUR_set(rx, p - RX_WRAPPED(rx));
6591     }
6592
6593     r->intflags = 0;
6594     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6595
6596     /* setup various meta data about recursion, this all requires
6597      * RExC_npar to be correctly set, and a bit later on we clear it */
6598     if (RExC_seen & REG_RECURSE_SEEN) {
6599         Newxz(RExC_open_parens, RExC_npar,regnode *);
6600         SAVEFREEPV(RExC_open_parens);
6601         Newxz(RExC_close_parens,RExC_npar,regnode *);
6602         SAVEFREEPV(RExC_close_parens);
6603     }
6604     if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) {
6605         /* Note, RExC_npar is 1 + the number of parens in a pattern.
6606          * So it's 1 if there are no parens. */
6607         RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6608                                          ((RExC_npar & 0x07) != 0);
6609         Newx(RExC_study_chunk_recursed,
6610              RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6611         SAVEFREEPV(RExC_study_chunk_recursed);
6612     }
6613
6614     /* Useful during FAIL. */
6615 #ifdef RE_TRACK_PATTERN_OFFSETS
6616     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6617     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6618                           "%s %"UVuf" bytes for offset annotations.\n",
6619                           ri->u.offsets ? "Got" : "Couldn't get",
6620                           (UV)((2*RExC_size+1) * sizeof(U32))));
6621 #endif
6622     SetProgLen(ri,RExC_size);
6623     RExC_rx_sv = rx;
6624     RExC_rx = r;
6625     RExC_rxi = ri;
6626
6627     /* Second pass: emit code. */
6628     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
6629     RExC_pm_flags = pm_flags;
6630     RExC_parse = exp;
6631     RExC_end = exp + plen;
6632     RExC_naughty = 0;
6633     RExC_npar = 1;
6634     RExC_emit_start = ri->program;
6635     RExC_emit = ri->program;
6636     RExC_emit_bound = ri->program + RExC_size + 1;
6637     pRExC_state->code_index = 0;
6638
6639     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6640     if (reg(pRExC_state, 0, &flags,1) == NULL) {
6641         ReREFCNT_dec(rx);
6642         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6643     }
6644     /* XXXX To minimize changes to RE engine we always allocate
6645        3-units-long substrs field. */
6646     Newx(r->substrs, 1, struct reg_substr_data);
6647     if (RExC_recurse_count) {
6648         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6649         SAVEFREEPV(RExC_recurse);
6650     }
6651
6652 reStudy:
6653     r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6654     Zero(r->substrs, 1, struct reg_substr_data);
6655     if (RExC_study_chunk_recursed)
6656         Zero(RExC_study_chunk_recursed,
6657              RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6658
6659 #ifdef TRIE_STUDY_OPT
6660     if (!restudied) {
6661         StructCopy(&zero_scan_data, &data, scan_data_t);
6662         copyRExC_state = RExC_state;
6663     } else {
6664         U32 seen=RExC_seen;
6665         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6666
6667         RExC_state = copyRExC_state;
6668         if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
6669             RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
6670         else
6671             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
6672         StructCopy(&zero_scan_data, &data, scan_data_t);
6673     }
6674 #else
6675     StructCopy(&zero_scan_data, &data, scan_data_t);
6676 #endif
6677
6678     /* Dig out information for optimizations. */
6679     r->extflags = RExC_flags; /* was pm_op */
6680     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6681
6682     if (UTF)
6683         SvUTF8_on(rx);  /* Unicode in it? */
6684     ri->regstclass = NULL;
6685     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
6686         r->intflags |= PREGf_NAUGHTY;
6687     scan = ri->program + 1;             /* First BRANCH. */
6688
6689     /* testing for BRANCH here tells us whether there is "must appear"
6690        data in the pattern. If there is then we can use it for optimisations */
6691     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /*  Only one top-level choice.
6692                                                   */
6693         SSize_t fake;
6694         STRLEN longest_float_length, longest_fixed_length;
6695         regnode_ssc ch_class; /* pointed to by data */
6696         int stclass_flag;
6697         SSize_t last_close = 0; /* pointed to by data */
6698         regnode *first= scan;
6699         regnode *first_next= regnext(first);
6700         /*
6701          * Skip introductions and multiplicators >= 1
6702          * so that we can extract the 'meat' of the pattern that must
6703          * match in the large if() sequence following.
6704          * NOTE that EXACT is NOT covered here, as it is normally
6705          * picked up by the optimiser separately.
6706          *
6707          * This is unfortunate as the optimiser isn't handling lookahead
6708          * properly currently.
6709          *
6710          */
6711         while ((OP(first) == OPEN && (sawopen = 1)) ||
6712                /* An OR of *one* alternative - should not happen now. */
6713             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6714             /* for now we can't handle lookbehind IFMATCH*/
6715             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6716             (OP(first) == PLUS) ||
6717             (OP(first) == MINMOD) ||
6718                /* An {n,m} with n>0 */
6719             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6720             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6721         {
6722                 /*
6723                  * the only op that could be a regnode is PLUS, all the rest
6724                  * will be regnode_1 or regnode_2.
6725                  *
6726                  * (yves doesn't think this is true)
6727                  */
6728                 if (OP(first) == PLUS)
6729                     sawplus = 1;
6730                 else {
6731                     if (OP(first) == MINMOD)
6732                         sawminmod = 1;
6733                     first += regarglen[OP(first)];
6734                 }
6735                 first = NEXTOPER(first);
6736                 first_next= regnext(first);
6737         }
6738
6739         /* Starting-point info. */
6740       again:
6741         DEBUG_PEEP("first:",first,0);
6742         /* Ignore EXACT as we deal with it later. */
6743         if (PL_regkind[OP(first)] == EXACT) {
6744             if (OP(first) == EXACT)
6745                 NOOP;   /* Empty, get anchored substr later. */
6746             else
6747                 ri->regstclass = first;
6748         }
6749 #ifdef TRIE_STCLASS
6750         else if (PL_regkind[OP(first)] == TRIE &&
6751                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6752         {
6753             regnode *trie_op;
6754             /* this can happen only on restudy */
6755             if ( OP(first) == TRIE ) {
6756                 struct regnode_1 *trieop = (struct regnode_1 *)
6757                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
6758                 StructCopy(first,trieop,struct regnode_1);
6759                 trie_op=(regnode *)trieop;
6760             } else {
6761                 struct regnode_charclass *trieop = (struct regnode_charclass *)
6762                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6763                 StructCopy(first,trieop,struct regnode_charclass);
6764                 trie_op=(regnode *)trieop;
6765             }
6766             OP(trie_op)+=2;
6767             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6768             ri->regstclass = trie_op;
6769         }
6770 #endif
6771         else if (REGNODE_SIMPLE(OP(first)))
6772             ri->regstclass = first;
6773         else if (PL_regkind[OP(first)] == BOUND ||
6774                  PL_regkind[OP(first)] == NBOUND)
6775             ri->regstclass = first;
6776         else if (PL_regkind[OP(first)] == BOL) {
6777             r->intflags |= (OP(first) == MBOL
6778                            ? PREGf_ANCH_MBOL
6779                            : (OP(first) == SBOL
6780                               ? PREGf_ANCH_SBOL
6781                               : PREGf_ANCH_BOL));
6782             first = NEXTOPER(first);
6783             goto again;
6784         }
6785         else if (OP(first) == GPOS) {
6786             r->intflags |= PREGf_ANCH_GPOS;
6787             first = NEXTOPER(first);
6788             goto again;
6789         }
6790         else if ((!sawopen || !RExC_sawback) &&
6791             (OP(first) == STAR &&
6792             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6793             !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
6794         {
6795             /* turn .* into ^.* with an implied $*=1 */
6796             const int type =
6797                 (OP(NEXTOPER(first)) == REG_ANY)
6798                     ? PREGf_ANCH_MBOL
6799                     : PREGf_ANCH_SBOL;
6800             r->intflags |= (type | PREGf_IMPLICIT);
6801             first = NEXTOPER(first);
6802             goto again;
6803         }
6804         if (sawplus && !sawminmod && !sawlookahead
6805             && (!sawopen || !RExC_sawback)
6806             && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6807             /* x+ must match at the 1st pos of run of x's */
6808             r->intflags |= PREGf_SKIP;
6809
6810         /* Scan is after the zeroth branch, first is atomic matcher. */
6811 #ifdef TRIE_STUDY_OPT
6812         DEBUG_PARSE_r(
6813             if (!restudied)
6814                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6815                               (IV)(first - scan + 1))
6816         );
6817 #else
6818         DEBUG_PARSE_r(
6819             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6820                 (IV)(first - scan + 1))
6821         );
6822 #endif
6823
6824
6825         /*
6826         * If there's something expensive in the r.e., find the
6827         * longest literal string that must appear and make it the
6828         * regmust.  Resolve ties in favor of later strings, since
6829         * the regstart check works with the beginning of the r.e.
6830         * and avoiding duplication strengthens checking.  Not a
6831         * strong reason, but sufficient in the absence of others.
6832         * [Now we resolve ties in favor of the earlier string if
6833         * it happens that c_offset_min has been invalidated, since the
6834         * earlier string may buy us something the later one won't.]
6835         */
6836
6837         data.longest_fixed = newSVpvs("");
6838         data.longest_float = newSVpvs("");
6839         data.last_found = newSVpvs("");
6840         data.longest = &(data.longest_fixed);
6841         ENTER_with_name("study_chunk");
6842         SAVEFREESV(data.longest_fixed);
6843         SAVEFREESV(data.longest_float);
6844         SAVEFREESV(data.last_found);
6845         first = scan;
6846         if (!ri->regstclass) {
6847             ssc_init(pRExC_state, &ch_class);
6848             data.start_class = &ch_class;
6849             stclass_flag = SCF_DO_STCLASS_AND;
6850         } else                          /* XXXX Check for BOUND? */
6851             stclass_flag = 0;
6852         data.last_closep = &last_close;
6853
6854         DEBUG_RExC_seen();
6855         minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
6856                              scan + RExC_size, /* Up to end */
6857             &data, -1, 0, NULL,
6858             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
6859                           | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6860             0);
6861
6862
6863         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6864
6865
6866         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6867              && data.last_start_min == 0 && data.last_end > 0
6868              && !RExC_seen_zerolen
6869              && !(RExC_seen & REG_VERBARG_SEEN)
6870              && !(RExC_seen & REG_GPOS_SEEN)
6871         ){
6872             r->extflags |= RXf_CHECK_ALL;
6873         }
6874         scan_commit(pRExC_state, &data,&minlen,0);
6875
6876         longest_float_length = CHR_SVLEN(data.longest_float);
6877
6878         if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
6879                    && data.offset_fixed == data.offset_float_min
6880                    && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6881             && S_setup_longest (aTHX_ pRExC_state,
6882                                     data.longest_float,
6883                                     &(r->float_utf8),
6884                                     &(r->float_substr),
6885                                     &(r->float_end_shift),
6886                                     data.lookbehind_float,
6887                                     data.offset_float_min,
6888                                     data.minlen_float,
6889                                     longest_float_length,
6890                                     cBOOL(data.flags & SF_FL_BEFORE_EOL),
6891                                     cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6892         {
6893             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6894             r->float_max_offset = data.offset_float_max;
6895             if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
6896                 r->float_max_offset -= data.lookbehind_float;
6897             SvREFCNT_inc_simple_void_NN(data.longest_float);
6898         }
6899         else {
6900             r->float_substr = r->float_utf8 = NULL;
6901             longest_float_length = 0;
6902         }
6903
6904         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6905
6906         if (S_setup_longest (aTHX_ pRExC_state,
6907                                 data.longest_fixed,
6908                                 &(r->anchored_utf8),
6909                                 &(r->anchored_substr),
6910                                 &(r->anchored_end_shift),
6911                                 data.lookbehind_fixed,
6912                                 data.offset_fixed,
6913                                 data.minlen_fixed,
6914                                 longest_fixed_length,
6915                                 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6916                                 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6917         {
6918             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6919             SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6920         }
6921         else {
6922             r->anchored_substr = r->anchored_utf8 = NULL;
6923             longest_fixed_length = 0;
6924         }
6925         LEAVE_with_name("study_chunk");
6926
6927         if (ri->regstclass
6928             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6929             ri->regstclass = NULL;
6930
6931         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6932             && stclass_flag
6933             && ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
6934             && !ssc_is_anything(data.start_class))
6935         {
6936             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
6937
6938             ssc_finalize(pRExC_state, data.start_class);
6939
6940             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
6941             StructCopy(data.start_class,
6942                        (regnode_ssc*)RExC_rxi->data->data[n],
6943                        regnode_ssc);
6944             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6945             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6946             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6947                       regprop(r, sv, (regnode*)data.start_class, NULL);
6948                       PerlIO_printf(Perl_debug_log,
6949                                     "synthetic stclass \"%s\".\n",
6950                                     SvPVX_const(sv));});
6951             data.start_class = NULL;
6952         }
6953
6954         /* A temporary algorithm prefers floated substr to fixed one to dig
6955          * more info. */
6956         if (longest_fixed_length > longest_float_length) {
6957             r->substrs->check_ix = 0;
6958             r->check_end_shift = r->anchored_end_shift;
6959             r->check_substr = r->anchored_substr;
6960             r->check_utf8 = r->anchored_utf8;
6961             r->check_offset_min = r->check_offset_max = r->anchored_offset;
6962             if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
6963                 r->intflags |= PREGf_NOSCAN;
6964         }
6965         else {
6966             r->substrs->check_ix = 1;
6967             r->check_end_shift = r->float_end_shift;
6968             r->check_substr = r->float_substr;
6969             r->check_utf8 = r->float_utf8;
6970             r->check_offset_min = r->float_min_offset;
6971             r->check_offset_max = r->float_max_offset;
6972         }
6973         if ((r->check_substr || r->check_utf8) ) {
6974             r->extflags |= RXf_USE_INTUIT;
6975             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6976                 r->extflags |= RXf_INTUIT_TAIL;
6977         }
6978         r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
6979
6980         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6981         if ( (STRLEN)minlen < longest_float_length )
6982             minlen= longest_float_length;
6983         if ( (STRLEN)minlen < longest_fixed_length )
6984             minlen= longest_fixed_length;
6985         */
6986     }
6987     else {
6988         /* Several toplevels. Best we can is to set minlen. */
6989         SSize_t fake;
6990         regnode_ssc ch_class;
6991         SSize_t last_close = 0;
6992
6993         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6994
6995         scan = ri->program + 1;
6996         ssc_init(pRExC_state, &ch_class);
6997         data.start_class = &ch_class;
6998         data.last_closep = &last_close;
6999
7000         DEBUG_RExC_seen();
7001         minlen = study_chunk(pRExC_state,
7002             &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7003             SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7004                                                       ? SCF_TRIE_DOING_RESTUDY
7005                                                       : 0),
7006             0);
7007
7008         CHECK_RESTUDY_GOTO_butfirst(NOOP);
7009
7010         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7011                 = r->float_substr = r->float_utf8 = NULL;
7012
7013         if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
7014             && ! ssc_is_anything(data.start_class))
7015         {
7016             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7017
7018             ssc_finalize(pRExC_state, data.start_class);
7019
7020             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7021             StructCopy(data.start_class,
7022                        (regnode_ssc*)RExC_rxi->data->data[n],
7023                        regnode_ssc);
7024             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7025             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7026             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7027                       regprop(r, sv, (regnode*)data.start_class, NULL);
7028                       PerlIO_printf(Perl_debug_log,
7029                                     "synthetic stclass \"%s\".\n",
7030                                     SvPVX_const(sv));});
7031             data.start_class = NULL;
7032         }
7033     }
7034
7035     if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7036         r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7037         r->maxlen = REG_INFTY;
7038     }
7039     else {
7040         r->maxlen = RExC_maxlen;
7041     }
7042
7043     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7044        the "real" pattern. */
7045     DEBUG_OPTIMISE_r({
7046         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%ld\n",
7047                       (IV)minlen, (IV)r->minlen, RExC_maxlen);
7048     });
7049     r->minlenret = minlen;
7050     if (r->minlen < minlen)
7051         r->minlen = minlen;
7052
7053     if (RExC_seen & REG_GPOS_SEEN)
7054         r->intflags |= PREGf_GPOS_SEEN;
7055     if (RExC_seen & REG_LOOKBEHIND_SEEN)
7056         r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7057                                                 lookbehind */
7058     if (pRExC_state->num_code_blocks)
7059         r->extflags |= RXf_EVAL_SEEN;
7060     if (RExC_seen & REG_CANY_SEEN)
7061         r->intflags |= PREGf_CANY_SEEN;
7062     if (RExC_seen & REG_VERBARG_SEEN)
7063     {
7064         r->intflags |= PREGf_VERBARG_SEEN;
7065         r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7066     }
7067     if (RExC_seen & REG_CUTGROUP_SEEN)
7068         r->intflags |= PREGf_CUTGROUP_SEEN;
7069     if (pm_flags & PMf_USE_RE_EVAL)
7070         r->intflags |= PREGf_USE_RE_EVAL;
7071     if (RExC_paren_names)
7072         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7073     else
7074         RXp_PAREN_NAMES(r) = NULL;
7075
7076     /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7077      * so it can be used in pp.c */
7078     if (r->intflags & PREGf_ANCH)
7079         r->extflags |= RXf_IS_ANCHORED;
7080
7081
7082     {
7083         /* this is used to identify "special" patterns that might result
7084          * in Perl NOT calling the regex engine and instead doing the match "itself",
7085          * particularly special cases in split//. By having the regex compiler
7086          * do this pattern matching at a regop level (instead of by inspecting the pattern)
7087          * we avoid weird issues with equivalent patterns resulting in different behavior,
7088          * AND we allow non Perl engines to get the same optimizations by the setting the
7089          * flags appropriately - Yves */
7090         regnode *first = ri->program + 1;
7091         U8 fop = OP(first);
7092         regnode *next = NEXTOPER(first);
7093         U8 nop = OP(next);
7094
7095         if (PL_regkind[fop] == NOTHING && nop == END)
7096             r->extflags |= RXf_NULL;
7097         else if (PL_regkind[fop] == BOL && nop == END)
7098             r->extflags |= RXf_START_ONLY;
7099         else if (fop == PLUS
7100                  && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7101                  && OP(regnext(first)) == END)
7102             r->extflags |= RXf_WHITE;
7103         else if ( r->extflags & RXf_SPLIT
7104                   && fop == EXACT
7105                   && STR_LEN(first) == 1
7106                   && *(STRING(first)) == ' '
7107                   && OP(regnext(first)) == END )
7108             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7109
7110     }
7111
7112     if (RExC_contains_locale) {
7113         RXp_EXTFLAGS(r) |= RXf_TAINTED;
7114     }
7115
7116 #ifdef DEBUGGING
7117     if (RExC_paren_names) {
7118         ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7119         ri->data->data[ri->name_list_idx]
7120                                    = (void*)SvREFCNT_inc(RExC_paren_name_list);
7121     } else
7122 #endif
7123         ri->name_list_idx = 0;
7124
7125     if (RExC_recurse_count) {
7126         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
7127             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
7128             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
7129         }
7130     }
7131     Newxz(r->offs, RExC_npar, regexp_paren_pair);
7132     /* assume we don't need to swap parens around before we match */
7133
7134     DEBUG_DUMP_r({
7135         DEBUG_RExC_seen();
7136         PerlIO_printf(Perl_debug_log,"Final program:\n");
7137         regdump(r);
7138     });
7139 #ifdef RE_TRACK_PATTERN_OFFSETS
7140     DEBUG_OFFSETS_r(if (ri->u.offsets) {
7141         const STRLEN len = ri->u.offsets[0];
7142         STRLEN i;
7143         GET_RE_DEBUG_FLAGS_DECL;
7144         PerlIO_printf(Perl_debug_log,
7145                       "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7146         for (i = 1; i <= len; i++) {
7147             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7148                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7149                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7150             }
7151         PerlIO_printf(Perl_debug_log, "\n");
7152     });
7153 #endif
7154
7155 #ifdef USE_ITHREADS
7156     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7157      * by setting the regexp SV to readonly-only instead. If the
7158      * pattern's been recompiled, the USEDness should remain. */
7159     if (old_re && SvREADONLY(old_re))
7160         SvREADONLY_on(rx);
7161 #endif
7162     return rx;
7163 }
7164
7165
7166 /* Entry point for the named-capture API: inspects the RXapif_* request
7167  * bits in flags (first match wins) and forwards to the matching
7168  * reg_named_buff_* helper.  Write requests croak; value is unused. */
7169 SV*
7170 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7171                     const U32 flags)
7172 {
7173     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7174     PERL_UNUSED_ARG(value);
7175
7176     if (flags & RXapif_FETCH)
7177         return reg_named_buff_fetch(rx, key, flags);
7178     if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7179         Perl_croak_no_modify();
7180         return NULL;
7181     }
7182     if (flags & RXapif_EXISTS)
7183         return reg_named_buff_exists(rx, key, flags) ? &PL_sv_yes : &PL_sv_no;
7184     if (flags & RXapif_REGNAMES)
7185         return reg_named_buff_all(rx, flags);
7186     if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT))
7187         return reg_named_buff_scalar(rx, flags);
7188
7189     Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7190     return NULL;
7191 }
7192
7193 /* Iteration entry point for named captures: RXapif_FIRSTKEY starts a walk,
7194  * RXapif_NEXTKEY continues it; anything else is a panic.  lastkey is unused. */
7195 SV*
7196 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7197                          const U32 flags)
7198 {
7199     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7200     PERL_UNUSED_ARG(lastkey);
7201
7202     if (flags & RXapif_FIRSTKEY)
7203         return reg_named_buff_firstkey(rx, flags);
7204     if (flags & RXapif_NEXTKEY)
7205         return reg_named_buff_nextkey(rx, flags);
7206
7207     Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
7208     return NULL;
7209 }
7210
7211 /* Fetch named capture namesv from r; NULL if the pattern has no such name.
7212  * Without RXapif_ALL: new SV for the first set group of that name, else NULL.
7213  * With RXapif_ALL: new RV to an array, one entry per group (undef if unset). */
7214 SV*
7215 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7216                           const U32 flags)
7217 {
7218     struct regexp *const rx = ReANY(r);
7219
7220     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7221
7222     if (rx && RXp_PAREN_NAMES(rx)) {
7223         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7224         if (he_str) {
7225             IV i;
7226             SV* sv_dat=HeVAL(he_str);
7227             I32 *nums=(I32*)SvPVX(sv_dat); /* group numbers sharing the name */
7228             /* Created only here, so the NULL-returning paths cannot leak it */
7229             AV * const retarray = (flags & RXapif_ALL) ? newAV() : NULL;
7230             for ( i=0; i<SvIVX(sv_dat); i++ ) {
7231                 SV *ret;
7232                 if ((I32)(rx->nparens) >= nums[i]
7233                     && rx->offs[nums[i]].start != -1
7234                     && rx->offs[nums[i]].end != -1) {
7235                     ret = newSVpvs("");
7236                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7237                     if (!retarray)
7238                         return ret;   /* scalar request: first match wins */
7239                 }
7240                 else if (!retarray)
7241                     continue;         /* scalar request: try the next group */
7242                 else
7243                     ret = newSVsv(&PL_sv_undef);
7244                 av_push(retarray, ret);
7245             }
7246             if (retarray)
7247                 return newRV_noinc(MUTABLE_SV(retarray));
7248         }
7249     }
7250     return NULL;
7251 }
7252
7253 /* True if the named group key exists in r.  With RXapif_ALL the name only
7254  * has to be declared in the pattern; otherwise a fetch of it must succeed. */
7255 bool
7256 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7257                            const U32 flags)
7258 {
7259     struct regexp *const rx = ReANY(r);
7260     SV *fetched;
7261
7262     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7263
7264     if (!rx || !RXp_PAREN_NAMES(rx))
7265         return FALSE;
7266     if (flags & RXapif_ALL)
7267         return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7268
7269     fetched = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7270     if (!fetched)
7271         return FALSE;
7272     SvREFCNT_dec_NN(fetched);   /* only existence matters; drop the value */
7273     return TRUE;
7274 }
7277
7278 /* Restart iteration over the pattern's name table and hand back the first
7279  * key via the NEXTKEY callback; NULL if r has no named groups. */
7280 SV*
7281 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7282 {
7283     struct regexp *const rx = ReANY(r);
7284
7285     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7286
7287     if (!rx || !RXp_PAREN_NAMES(rx))
7288         return NULL;
7289
7290     (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7291     return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7292 }
7293
7294 /* Return the next name from the name-table iterator as a new SV, or NULL when
7295  * done.  Names with no set group are skipped unless RXapif_ALL is given. */
7296 SV*
7297 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7298 {
7299     struct regexp *const rx = ReANY(r);
7300     HV * const names = rx ? RXp_PAREN_NAMES(rx) : NULL;
7301     HE *entry;
7302     GET_RE_DEBUG_FLAGS_DECL;
7303
7304     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7305
7306     if (!names)
7307         return NULL;
7308     while ( (entry = hv_iternext_flags(names,0)) ) {
7309         SV * const sv_dat = HeVAL(entry);
7310         I32 * const nums = (I32*)SvPVX(sv_dat);
7311         IV hit = 0;     /* number of the first set group with this name */
7312         IV i;
7313         for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7314             if ((I32)(rx->lastparen) < nums[i]
7315                 || rx->offs[nums[i]].start == -1
7316                 || rx->offs[nums[i]].end == -1)
7317                 continue;
7318             hit = nums[i];
7319             break;
7320         }
7321         if (hit || flags & RXapif_ALL)
7322             return newSVhek(HeKEY_hek(entry));
7323     }
7324     return NULL;
7325 }
7326
7327 /* Return a new SV holding a count of named groups, or undef if r has none.
7328  * RXapif_ALL / RXapif_REGNAMES_COUNT: every name declared in the pattern.
7329  * RXapif_ONE: the names listed by the NAMED_BUFF_ALL callback. */
7330 SV*
7331 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7332 {
7333     struct regexp *const rx = ReANY(r);
7334
7335     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7336
7337     if (!rx || !RXp_PAREN_NAMES(rx))
7338         return &PL_sv_undef;
7339
7340     if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT))
7341         return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7342
7343     if (flags & RXapif_ONE) {
7344         SV * const rv = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7345         const SSize_t last_ix = av_tindex(MUTABLE_AV(SvRV(rv)));
7346
7347         SvREFCNT_dec_NN(rv);    /* the list itself is no longer needed */
7348         return newSViv(last_ix + 1);
7349     }
7350
7351     Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
7352     return NULL;
7353 }
7354
7355 /* Return a new RV to an array of group names.  With RXapif_ALL every name in
7356  * the pattern's table is listed, otherwise only names that have a set group.
7357  * The array is empty when r has no named groups. */
7358 SV*
7359 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7360 {
7361     struct regexp *const rx = ReANY(r);
7362     AV * const names_av = newAV();
7363     HV * const names = rx ? RXp_PAREN_NAMES(rx) : NULL;
7364     HE *entry;
7365
7366     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7367
7368     if (names) {
7369         (void)hv_iterinit(names);
7370         while ( (entry = hv_iternext_flags(names,0)) ) {
7371             SV * const sv_dat = HeVAL(entry);
7372             I32 * const nums = (I32*)SvPVX(sv_dat);
7373             IV hit = 0; /* number of the first set group with this name */
7374             IV i;
7375             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7376                 if ((I32)(rx->lastparen) < nums[i]
7377                     || rx->offs[nums[i]].start == -1
7378                     || rx->offs[nums[i]].end == -1)
7379                     continue;
7380                 hit = nums[i];
7381                 break;
7382             }
7383             if (hit || flags & RXapif_ALL)
7384                 av_push(names_av, newSVhek(HeKEY_hek(entry)));
7385         }
7386     }
7387     return newRV_noinc(MUTABLE_SV(names_av));
7388 }
7389
7390 void
7391 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7392                              SV * const sv)
7393 {
         /* Set sv to the text of capture buffer paren, taken from the match
          * offsets (rx->offs) and saved subject string (rx->subbeg) held in r.
          * paren is either a group number or one of the RX_BUFF_IDX_*
          * pseudo-indexes for the pre-, full- and post-match strings.
          * sv is set to undef when the requested text is not available.
          * On success sv's UTF8 flag and taintedness are set from the match.
          */
7394     struct regexp *const rx = ReANY(r);
7395     char *s = NULL;     /* start of the text to copy */
7396     SSize_t i = 0;      /* its length in bytes */
7397     SSize_t s1, t1;     /* start/end offsets of a numbered group */
7398     I32 n = paren;      /* working copy; may be remapped below */
7399
7400     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7401
         /* The ${^PREMATCH}/${^MATCH}/${^POSTMATCH} forms are only served when
          * KEEPCOPY is set, either on the regex or on the current PMOP. */
7402     if (      n == RX_BUFF_IDX_CARET_PREMATCH
7403            || n == RX_BUFF_IDX_CARET_FULLMATCH
7404            || n == RX_BUFF_IDX_CARET_POSTMATCH
7405        )
7406     {
7407         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7408         if (!keepcopy) {
7409             /* on something like
7410              *    $r = qr/.../;
7411              *    /$qr/p;
7412              * the KEEPCOPY is set on the PMOP rather than the regex */
7413             if (PL_curpm && r == PM_GETRE(PL_curpm))
7414                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7415         }
7416         if (!keepcopy)
7417             goto ret_undef;
7418     }
7419
         /* Every extraction below is relative to subbeg; without it there is
          * nothing to return. */
7420     if (!rx->subbeg)
7421         goto ret_undef;
7422
7423     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7424         /* no need to distinguish between them any more */
7425         n = RX_BUFF_IDX_FULLMATCH;
7426
7427     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7428         && rx->offs[0].start != -1)
7429     {
7430         /* $`, ${^PREMATCH} */
7431         i = rx->offs[0].start;
7432         s = rx->subbeg;
7433     }
7434     else
7435     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7436         && rx->offs[0].end != -1)
7437     {
7438         /* $', ${^POSTMATCH} */
7439         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7440         i = rx->sublen + rx->suboffset - rx->offs[0].end;
7441     }
7442     else
         /* The lower bound keeps any remaining pseudo-index from being used
          * as a subscript into rx->offs. */
7443     if ( 0 <= n && n <= (I32)rx->nparens &&
7444         (s1 = rx->offs[n].start) != -1 &&
7445         (t1 = rx->offs[n].end) != -1)
7446     {
7447         /* $&, ${^MATCH},  $1 ... */
7448         i = t1 - s1;
7449         s = rx->subbeg + s1 - rx->suboffset;
7450     } else {
7451         goto ret_undef;
7452     }
7453
         /* Sanity checks: [s, s+i) must lie inside the saved buffer. */
7454     assert(s >= rx->subbeg);
7455     assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7456     if (i >= 0) {
7457 #ifdef NO_TAINT_SUPPORT
7458         sv_setpvn(sv, s, i);
7459 #else
             /* Copy with the taint state cleared, then restore the previous
              * state; sv's own taintedness is decided further down. */
7460         const int oldtainted = TAINT_get;
7461         TAINT_NOT;
7462         sv_setpvn(sv, s, i);
7463         TAINT_set(oldtainted);
7464 #endif
             /* With PREGf_CANY_SEEN the UTF8 flag is only turned on if the
              * copied bytes are empty or pass is_utf8_string(); otherwise it
              * simply follows RXp_MATCH_UTF8. */
7465         if ( (rx->intflags & PREGf_CANY_SEEN)
7466             ? (RXp_MATCH_UTF8(rx)
7467                         && (!i || is_utf8_string((U8*)s, i)))
7468             : (RXp_MATCH_UTF8(rx)) )
7469         {
7470             SvUTF8_on(sv);
7471         }
7472         else
7473             SvUTF8_off(sv);
7474         if (TAINTING_get) {
7475             if (RXp_MATCH_TAINTED(rx)) {
7476                 if (SvTYPE(sv) >= SVt_PVMG) {
                         /* Unhook the first magic entry while SvTAINT() runs,
                          * then relink it in front of whatever magic sv holds
                          * afterwards.
                          * NOTE(review): presumably this keeps sv's original
                          * first magic entry at the head of the chain once
                          * taint magic has been added -- confirm. */
7477                     MAGIC* const mg = SvMAGIC(sv);
7478                     MAGIC* mgt;
7479                     TAINT;
7480                     SvMAGIC_set(sv, mg->mg_moremagic);
7481                     SvTAINT(sv);
7482                     if ((mgt = SvMAGIC(sv))) {
7483                         mg->mg_moremagic = mgt;
7484                         SvMAGIC_set(sv, mg);
7485                     }
7486                 } else {
7487                     TAINT;
7488                     SvTAINT(sv);
7489                 }
7490             } else
7491                 SvTAINTED_off(sv);
7492         }
7493     } else {
           /* Reached for a negative length and, via goto, from every
            * "not available" case above. */
7494       ret_undef:
7495         sv_setsv(sv,&PL_sv_undef);
7496         return;
7497     }
7498 }
7499
7500 /* Stores into capture buffers are rejected: croak unless PL_localizing. */
7501 void
7502 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7503                              SV const * const value)
7504 {
7505     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7506     PERL_UNUSED_ARG(rx);
7507     PERL_UNUSED_ARG(paren);
7508     PERL_UNUSED_ARG(value);
7509     if (PL_localizing)
7510         return;
7511     Perl_croak_no_modify();
7512 }
7513
7514 /* Return the length of capture buffer paren (a group number or RX_BUFF_IDX_*
7515  * pseudo-index) from the last match in r: characters if the match was valid
7516  * UTF-8, else bytes.  An unset buffer reports sv as uninitialized, giving 0. */
7517 I32
7518 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7519                               const I32 paren)
7520 {
7521     struct regexp *const rx = ReANY(r);
7522     I32 i;
7523     I32 s1, t1;
7524     I32 n = paren;
7525
7526     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7527
7528     if (   n == RX_BUFF_IDX_CARET_PREMATCH
7529         || n == RX_BUFF_IDX_CARET_FULLMATCH
7530         || n == RX_BUFF_IDX_CARET_POSTMATCH)
7531     {
7532         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7533         if (!keepcopy) {
7534             /* on something like  $r = qr/.../; /$qr/p;
7535              * the KEEPCOPY is set on the PMOP rather than the regex */
7536             if (PL_curpm && r == PM_GETRE(PL_curpm))
7537                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7538         }
7539         if (!keepcopy)
7540             goto warn_undef;
7541     }
7542
7543     /* As in reg_numbered_buff_fetch(): from here on ${^MATCH} is just $&.
7544      * Its own pseudo-index must never be used to subscript rx->offs. */
7545     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7546         n = RX_BUFF_IDX_FULLMATCH;
7547
7548     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7549     switch (n) {
7550       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7551       case RX_BUFF_IDX_PREMATCH:       /* $` */
7552         i = rx->offs[0].start;
7553         if (i <= 0)             /* no match (-1), or nothing before it */
7554             return 0;
7555         s1 = 0;
7556         t1 = i;
7557         goto getlen;
7558
7559       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7560       case RX_BUFF_IDX_POSTMATCH:       /* $' */
7561         if (rx->offs[0].end == -1)
7562             return 0;
7563         i = rx->sublen - rx->offs[0].end;
7564         if (i <= 0)
7565             return 0;
7566         s1 = rx->offs[0].end;
7567         t1 = rx->sublen;
7568         goto getlen;
7569
7570       default: /* $& / ${^MATCH}, $1, $2, ... */
7571         if (0 <= n && n <= (I32)rx->nparens &&
7572             (s1 = rx->offs[n].start) != -1 &&
7573             (t1 = rx->offs[n].end) != -1) {
7574             i = t1 - s1;
7575             goto getlen;
7576         } else {
7577           warn_undef:
7578             if (ckWARN(WARN_UNINITIALIZED))
7579                 report_uninit((const SV *)sv);
7580             return 0;
7581         }
7582     }
7583   getlen:
7584     if (i > 0 && RXp_MATCH_UTF8(rx)) {
7585         const char * const s = rx->subbeg - rx->suboffset + s1;
7586         const U8 *ep;
7587         STRLEN el;
7588         i = t1 - s1;
7589         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7590             i = el;
7591     }
7592     return i;
7593 }
7594
7595 /* Return a new SV holding the string "Regexp".  rx is not consulted.
7596  * NOTE(review): by its name, the package qr// objects belong to -- confirm. */
7597 SV*
7598 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7599 {
7600     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7601     PERL_UNUSED_ARG(rx);
7602
7603     return newSVpvs("Regexp");
7604 }
7605
7606 /* Scans the name of a named buffer from the pattern.
7607  * If flags is REG_RSN_RETURN_NULL returns null.
7608  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7609  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7610  * to the parsed name as looked up in the RExC_paren_names hash.
7611  * If there is an error, throws a vFAIL()-type exception.
7612  */
7613
7614 #define REG_RSN_RETURN_NULL    0
7615 #define REG_RSN_RETURN_NAME    1
7616 #define REG_RSN_RETURN_DATA    2
7617
7618 STATIC SV*
7619 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7620 {
7621     char *name_start = RExC_parse;      /* first character of the name */
7622     SV *sv_name;
7623
7624     PERL_ARGS_ASSERT_REG_SCAN_NAME;
7625
7626     assert (RExC_parse <= RExC_end);
7627     /* Step RExC_parse past the name; at end of pattern there is none */
7628     if (RExC_parse != RExC_end) {
7629         if (! isIDFIRST_lazy_if(RExC_parse, UTF)) {
7630             RExC_parse++; /* so vFAIL's <- lands after the offending char */
7631             vFAIL("Group name must start with a non-digit word character");
7632         }
7633         else if (UTF) {
7634             do {    /* do...while: the IDFIRST char is always consumed */
7635                 RExC_parse += UTF8SKIP(RExC_parse);
7636             } while (isWORDCHAR_utf8((U8*)RExC_parse));
7637         }
7638         else {
7639             do {
7640                 RExC_parse++;
7641             } while (isWORDCHAR(*RExC_parse));
7642         }
7643     }
7644
7645     if (! flags)    /* REG_RSN_RETURN_NULL: only the skipping was wanted */
7646         return NULL;
7647     sv_name = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7648                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7649     if (flags == REG_RSN_RETURN_NAME)
7650         return sv_name;
7651     if (flags == REG_RSN_RETURN_DATA) {
7652         HE *he_str = NULL;
7653         SV *sv_dat = NULL;
7654         if ( ! sv_name )      /* should not happen*/
7655             Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7656         if (RExC_paren_names)
7657             he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7658         if ( he_str )
7659             sv_dat = HeVAL(he_str);
7660         if ( ! sv_dat )
7661             vFAIL("Reference to nonexistent named group");
7662         return sv_dat;
7663     }
7664     Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7665                (unsigned long) flags);
7666     assert(0); /* NOT REACHED */
7667     return NULL;
7668 }
7669
/* Parse-tracing prefix, active only through DEBUG_PARSE_r.  Prints, without a
 * trailing newline, three columns:
 *  1) up to 10 bytes of the not-yet-parsed pattern, followed by "..." when
 *     more than 10 remain or by "<" otherwise; blank when the parse position
 *     is the same as at the previous message (RExC_lastparse);
 *  2) the current node number (RExC_size + 1 in the SIZE_ONLY pass, else
 *     REG_NODE_NUM(RExC_emit)); blank when unchanged from the previous
 *     message (RExC_lastnum);
 *  3) 'funcname', indented by twice the value of 'depth', which must be a
 *     variable in scope wherever this macro is expanded.
 * RExC_lastparse and RExC_lastnum are then updated for the next message. */
#define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({                   \
    int rem = (int)(RExC_end - RExC_parse);                             \
    int cut;                                                            \
    int num;                                                            \
    int iscut = 0;                                                      \
    if (rem > 10) {                                                     \
        iscut = 1;                                                      \
        rem = 10;                                                       \
    }                                                                   \
    cut = 10 - rem;                                                     \
    if (RExC_lastparse == RExC_parse)                                   \
        PerlIO_printf(Perl_debug_log, "%16s", "");                      \
    else                                                                \
        PerlIO_printf(Perl_debug_log, " >%.*s%-*s",                     \
                      rem, RExC_parse,                                  \
                      cut + 4,                                          \
                      iscut ? "..." : "<");                             \
                                                                        \
    if (SIZE_ONLY)                                                      \
        num = RExC_size + 1;                                            \
    else                                                                \
        num = REG_NODE_NUM(RExC_emit);                                  \
    if (RExC_lastnum == num)                                            \
        PerlIO_printf(Perl_debug_log, "|%4s", "");                      \
    else                                                                \
        PerlIO_printf(Perl_debug_log, "|%4d", num);                     \
    PerlIO_printf(Perl_debug_log, "|%*s%-4s",                           \
                  (int)((depth*2)), "",                                 \
                  (funcname));                                          \
    RExC_lastparse = RExC_parse;                                        \
    RExC_lastnum = num;                                                 \
})
7704
7705
7706
/* Prints the DEBUG_PARSE_MSG() prefix for 'funcname', then terminates the
 * line (the newline is printed right-justified in a field of width 4). */
#define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({                       \
    DEBUG_PARSE_MSG((funcname));                                        \
    PerlIO_printf(Perl_debug_log, "%4s", "\n");                         \
})

/* Prints the DEBUG_PARSE_MSG() prefix for 'funcname', followed by 'args'
 * formatted according to 'fmt', and a newline.  'fmt' has "\n" appended to it
 * by string literal concatenation, so it must itself be a string literal. */
#define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({          \
    DEBUG_PARSE_MSG((funcname));                                        \
    PerlIO_printf(Perl_debug_log, fmt "\n", args);                      \
})
7715
7716 /* This section of code defines the inversion list object and its methods.  The
7717  * interfaces are highly subject to change, so as much as possible is static to
7718  * this file.  An inversion list is here implemented as a malloc'd C UV array
7719  * held in an SVt_INVLIST scalar.
7720  *
7721  * An inversion list for Unicode is an array of code points, sorted by ordinal
7722  * number.  The zeroth element is the first code point in the list.  The 1th
7723  * element is the first element beyond that not in the list.  In other words,
7724  * the first range is
7725  *  invlist[0]..(invlist[1]-1)
7726  * The other ranges follow.  Thus every element whose index is divisible by two
7727  * marks the beginning of a range that is in the list, and every element not
7728  * divisible by two marks the beginning of a range not in the list.  A single
7729  * element inversion list that contains the single code point N generally
7730  * consists of two elements
7731  *  invlist[0] == N
7732  *  invlist[1] == N+1
7733  * (The exception is when N is the highest representable value on the
7734  * machine, in which case the list containing just it would be a single
7735  * element, itself.  By extension, if the last range in the list extends to
7736  * infinity, then the first element of that range will be in the inversion list
7737  * at a position that is divisible by two, and is the final element in the
7738  * list.)
7739  * Taking the complement (inverting) an inversion list is quite simple, if the
7740  * first element is 0, remove it; otherwise add a 0 element at the beginning.
7741  * This implementation reserves an element at the beginning of each inversion
7742  * list to always contain 0; there is an additional flag in the header which
7743  * indicates if the list begins at the 0, or is offset to begin at the next
7744  * element.
7745  *
7746  * More about inversion lists can be found in "Unicode Demystified"
7747  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7748  * More will be coming when functionality is added later.
7749  *
7750  * The inversion list data structure is currently implemented as an SV pointing
7751  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
7752  * array of UV whose memory management is automatically handled by the existing
7753  * facilities for SV's.
7754  *
7755  * Some of the methods should always be private to the implementation, and some
7756  * should eventually be made public */
7757
7758 /* The header definitions are in F<inline_invlist.c> */
7759
7760 PERL_STATIC_INLINE UV*
7761 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7762 {
7763     /* Returns a pointer to the first element in the inversion list's array.
7764      * This is called upon initialization of an inversion list.  Where the
7765      * array begins depends on whether the list has the code point U+0000 in it
7766      * or not.  The other parameter tells it whether the code that follows this
7767      * call is about to put a 0 in the inversion list or not.  The first
7768      * element is either the element reserved for 0, if TRUE, or the element
7769      * after it, if FALSE */
7770
7771     bool* offset = get_invlist_offset_addr(invlist);
7772     UV* zero_addr = (UV *) SvPVX(invlist);
7773
7774     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7775
7776     /* Must be empty */
7777     assert(! _invlist_len(invlist));
7778
7779     *zero_addr = 0;
7780
7781     /* 1^1 = 0; 1^0 = 1 */
7782     *offset = 1 ^ will_have_0;
7783     return zero_addr + *offset;
7784 }
7785
7786 PERL_STATIC_INLINE UV*
7787 S_invlist_array(pTHX_ SV* const invlist)
7788 {
7789     /* Returns the pointer to the inversion list's array.  Every time the
7790      * length changes, this needs to be called in case malloc or realloc moved
7791      * it */
7792
7793     PERL_ARGS_ASSERT_INVLIST_ARRAY;
7794
7795     /* Must not be empty.  If these fail, you probably didn't check for <len>
7796      * being non-zero before trying to get the array */
7797     assert(_invlist_len(invlist));
7798
7799     /* The very first element always contains zero, The array begins either
7800      * there, or if the inversion list is offset, at the element after it.
7801      * The offset header field determines which; it contains 0 or 1 to indicate
7802      * how much additionally to add */
7803     assert(0 == *(SvPVX(invlist)));
7804     return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
7805 }
7806
7807 PERL_STATIC_INLINE void
7808 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
7809 {
7810     /* Sets the current number of elements stored in the inversion list.
7811      * Updates SvCUR correspondingly */
7812
7813     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7814
7815     assert(SvTYPE(invlist) == SVt_INVLIST);
7816
7817     SvCUR_set(invlist,
7818               (len == 0)
7819                ? 0
7820                : TO_INTERNAL_SIZE(len + offset));
7821     assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
7822 }
7823
7824 PERL_STATIC_INLINE IV*
7825 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7826 {
7827     /* Return the address of the IV that is reserved to hold the cached index
7828      * */
7829
7830     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7831
7832     assert(SvTYPE(invlist) == SVt_INVLIST);
7833
7834     return &(((XINVLIST*) SvANY(invlist))->prev_index);
7835 }
7836
7837 PERL_STATIC_INLINE IV
7838 S_invlist_previous_index(pTHX_ SV* const invlist)
7839 {
7840     /* Returns cached index of previous search */
7841
7842     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7843
7844     return *get_invlist_previous_index_addr(invlist);
7845 }
7846
7847 PERL_STATIC_INLINE void
7848 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7849 {
7850     /* Caches <index> for later retrieval */
7851
7852     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7853
7854     assert(index == 0 || index < (int) _invlist_len(invlist));
7855
7856     *get_invlist_previous_index_addr(invlist) = index;
7857 }
7858
7859 PERL_STATIC_INLINE UV
7860 S_invlist_max(pTHX_ SV* const invlist)
7861 {
7862     /* Returns the maximum number of elements storable in the inversion list's
7863      * array, without having to realloc() */
7864
7865     PERL_ARGS_ASSERT_INVLIST_MAX;
7866
7867     assert(SvTYPE(invlist) == SVt_INVLIST);
7868
7869     /* Assumes worst case, in which the 0 element is not counted in the
7870      * inversion list, so subtracts 1 for that */
7871     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
7872            ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
7873            : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
7874 }
7875
7876 #ifndef PERL_IN_XSUB_RE
7877 SV*
7878 Perl__new_invlist(pTHX_ IV initial_size)
7879 {
7880
7881     /* Return a pointer to a newly constructed inversion list, with enough
7882      * space to store 'initial_size' elements.  If that number is negative, a
7883      * system default is used instead */
7884
7885     SV* new_list;
7886
7887     if (initial_size < 0) {
7888         initial_size = 10;
7889     }
7890
7891     /* Allocate the initial space */
7892     new_list = newSV_type(SVt_INVLIST);
7893
7894     /* First 1 is in case the zero element isn't in the list; second 1 is for
7895      * trailing NUL */
7896     SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
7897     invlist_set_len(new_list, 0, 0);
7898
7899     /* Force iterinit() to be used to get iteration to work */
7900     *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
7901
7902     *get_invlist_previous_index_addr(new_list) = 0;
7903
7904     return new_list;
7905 }
7906
7907 SV*
7908 Perl__new_invlist_C_array(pTHX_ const UV* const list)
7909 {
7910     /* Return a pointer to a newly constructed inversion list, initialized to
7911      * point to <list>, which has to be in the exact correct inversion list
7912      * form, including internal fields.  Thus this is a dangerous routine that
7913      * should not be used in the wrong hands.  The passed in 'list' contains
7914      * several header fields at the beginning that are not part of the
7915      * inversion list body proper */
7916
7917     const STRLEN length = (STRLEN) list[0];
7918     const UV version_id =          list[1];
7919     const bool offset   =    cBOOL(list[2]);
7920 #define HEADER_LENGTH 3
7921     /* If any of the above changes in any way, you must change HEADER_LENGTH
7922      * (if appropriate) and regenerate INVLIST_VERSION_ID by running
7923      *      perl -E 'say int(rand 2**31-1)'
7924      */
7925 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
7926                                         data structure type, so that one being
7927                                         passed in can be validated to be an
7928                                         inversion list of the correct vintage.
7929                                        */
7930
7931     SV* invlist = newSV_type(SVt_INVLIST);
7932
7933     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7934
7935     if (version_id != INVLIST_VERSION_ID) {
7936         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7937     }
7938
7939     /* The generated array passed in includes header elements that aren't part
7940      * of the list proper, so start it just after them */
7941     SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
7942
7943     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
7944                                shouldn't touch it */
7945
7946     *(get_invlist_offset_addr(invlist)) = offset;
7947
7948     /* The 'length' passed to us is the physical number of elements in the
7949      * inversion list.  But if there is an offset the logical number is one
7950      * less than that */
7951     invlist_set_len(invlist, length  - offset, offset);
7952
7953     invlist_set_previous_index(invlist, 0);
7954
7955     /* Initialize the iteration pointer. */
7956     invlist_iterfinish(invlist);
7957
7958     SvREADONLY_on(invlist);
7959
7960     return invlist;
7961 }
7962 #endif /* ifndef PERL_IN_XSUB_RE */
7963
7964 STATIC void
7965 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7966 {
7967     /* Grow the maximum size of an inversion list */
7968
7969     PERL_ARGS_ASSERT_INVLIST_EXTEND;
7970
7971     assert(SvTYPE(invlist) == SVt_INVLIST);
7972
7973     /* Add one to account for the zero element at the beginning which may not
7974      * be counted by the calling parameters */
7975     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
7976 }
7977
7978 PERL_STATIC_INLINE void
7979 S_invlist_trim(pTHX_ SV* const invlist)
7980 {
7981     PERL_ARGS_ASSERT_INVLIST_TRIM;
7982
7983     assert(SvTYPE(invlist) == SVt_INVLIST);
7984
7985     /* Change the length of the inversion list to how many entries it currently
7986      * has */
7987     SvPV_shrink_to_cur((SV *) invlist);
7988 }
7989
7990 STATIC void
7991 S__append_range_to_invlist(pTHX_ SV* const invlist,
7992                                  const UV start, const UV end)
7993 {
7994    /* Subject to change or removal.  Append the range from 'start' to 'end' at
7995     * the end of the inversion list.  The range must be above any existing
7996     * ones. */
7997
7998     UV* array;
7999     UV max = invlist_max(invlist);
8000     UV len = _invlist_len(invlist);
8001     bool offset;
8002
8003     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8004
8005     if (len == 0) { /* Empty lists must be initialized */
8006         offset = start != 0;
8007         array = _invlist_array_init(invlist, ! offset);
8008     }
8009     else {
8010         /* Here, the existing list is non-empty. The current max entry in the
8011          * list is generally the first value not in the set, except when the
8012          * set extends to the end of permissible values, in which case it is
8013          * the first entry in that final set, and so this call is an attempt to
8014          * append out-of-order */
8015
8016         UV final_element = len - 1;
8017         array = invlist_array(invlist);
8018         if (array[final_element] > start
8019             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8020         {
8021             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8022                      array[final_element], start,
8023                      ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8024         }
8025
8026         /* Here, it is a legal append.  If the new range begins with the first
8027          * value not in the set, it is extending the set, so the new first
8028          * value not in the set is one greater than the newly extended range.
8029          * */
8030         offset = *get_invlist_offset_addr(invlist);
8031         if (array[final_element] == start) {
8032             if (end != UV_MAX) {
8033                 array[final_element] = end + 1;
8034             }
8035             else {
8036                 /* But if the end is the maximum representable on the machine,
8037                  * just let the range that this would extend to have no end */
8038                 invlist_set_len(invlist, len - 1, offset);
8039             }
8040             return;
8041         }
8042     }
8043
8044     /* Here the new range doesn't extend any existing set.  Add it */
8045
8046     len += 2;   /* Includes an element each for the start and end of range */
8047
8048     /* If wll overflow the existing space, extend, which may cause the array to
8049      * be moved */
8050     if (max < len) {
8051         invlist_extend(invlist, len);
8052
8053         /* Have to set len here to avoid assert failure in invlist_array() */
8054         invlist_set_len(invlist, len, offset);
8055
8056         array = invlist_array(invlist);
8057     }
8058     else {
8059         invlist_set_len(invlist, len, offset);
8060     }
8061
8062     /* The next item on the list starts the range, the one after that is
8063      * one past the new range.  */
8064     array[len - 2] = start;
8065     if (end != UV_MAX) {
8066         array[len - 1] = end + 1;
8067     }
8068     else {
8069         /* But if the end is the maximum representable on the machine, just let
8070          * the range have no end */
8071         invlist_set_len(invlist, len - 1, offset);
8072     }
8073 }
8074
8075 #ifndef PERL_IN_XSUB_RE
8076
8077 IV
8078 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
8079 {
8080     /* Searches the inversion list for the entry that contains the input code
8081      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
8082      * return value is the index into the list's array of the range that
8083      * contains <cp> */
8084
8085     IV low = 0;
8086     IV mid;
8087     IV high = _invlist_len(invlist);
8088     const IV highest_element = high - 1;
8089     const UV* array;
8090
8091     PERL_ARGS_ASSERT__INVLIST_SEARCH;
8092
8093     /* If list is empty, return failure. */
8094     if (high == 0) {
8095         return -1;
8096     }
8097
8098     /* (We can't get the array unless we know the list is non-empty) */
8099     array = invlist_array(invlist);
8100
8101     mid = invlist_previous_index(invlist);
8102     assert(mid >=0 && mid <= highest_element);
8103
8104     /* <mid> contains the cache of the result of the previous call to this
8105      * function (0 the first time).  See if this call is for the same result,
8106      * or if it is for mid-1.  This is under the theory that calls to this
8107      * function will often be for related code points that are near each other.
8108      * And benchmarks show that caching gives better results.  We also test
8109      * here if the code point is within the bounds of the list.  These tests
8110      * replace others that would have had to be made anyway to make sure that
8111      * the array bounds were not exceeded, and these give us extra information
8112      * at the same time */
8113     if (cp >= array[mid]) {
8114         if (cp >= array[highest_element]) {
8115             return highest_element;
8116         }
8117
8118         /* Here, array[mid] <= cp < array[highest_element].  This means that
8119          * the final element is not the answer, so can exclude it; it also
8120          * means that <mid> is not the final element, so can refer to 'mid + 1'
8121          * safely */
8122         if (cp < array[mid + 1]) {
8123             return mid;
8124         }
8125         high--;
8126         low = mid + 1;
8127     }
8128     else { /* cp < aray[mid] */
8129         if (cp < array[0]) { /* Fail if outside the array */
8130             return -1;
8131         }
8132         high = mid;
8133         if (cp >= array[mid - 1]) {
8134             goto found_entry;
8135         }
8136     }
8137
8138     /* Binary search.  What we are looking for is <i> such that
8139      *  array[i] <= cp < array[i+1]
8140      * The loop below converges on the i+1.  Note that there may not be an
8141      * (i+1)th element in the array, and things work nonetheless */
8142     while (low < high) {
8143         mid = (low + high) / 2;
8144         assert(mid <= highest_element);
8145         if (array[mid] <= cp) { /* cp >= array[mid] */
8146             low = mid + 1;
8147
8148             /* We could do this extra test to exit the loop early.
8149             if (cp < array[low]) {
8150                 return mid;
8151             }
8152             */
8153         }
8154         else { /* cp < array[mid] */
8155             high = mid;
8156         }
8157     }
8158
8159   found_entry:
8160     high--;
8161     invlist_set_previous_index(invlist, high);
8162     return high;
8163 }
8164
8165 void
8166 Perl__invlist_populate_swatch(pTHX_ SV* const invlist,
8167                                     const UV start, const UV end, U8* swatch)
8168 {
8169     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8170      * but is used when the swash has an inversion list.  This makes this much
8171      * faster, as it uses a binary search instead of a linear one.  This is
8172      * intimately tied to that function, and perhaps should be in utf8.c,
8173      * except it is intimately tied to inversion lists as well.  It assumes
8174      * that <swatch> is all 0's on input */
8175
8176     UV current = start;
8177     const IV len = _invlist_len(invlist);
8178     IV i;
8179     const UV * array;
8180
8181     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8182
8183     if (len == 0) { /* Empty inversion list */
8184         return;
8185     }
8186
8187     array = invlist_array(invlist);
8188
8189     /* Find which element it is */
8190     i = _invlist_search(invlist, start);
8191
8192     /* We populate from <start> to <end> */
8193     while (current < end) {
8194         UV upper;
8195
8196         /* The inversion list gives the results for every possible code point
8197          * after the first one in the list.  Only those ranges whose index is
8198          * even are ones that the inversion list matches.  For the odd ones,
8199          * and if the initial code point is not in the list, we have to skip
8200          * forward to the next element */
8201         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8202             i++;
8203             if (i >= len) { /* Finished if beyond the end of the array */
8204                 return;
8205             }
8206             current = array[i];
8207             if (current >= end) {   /* Finished if beyond the end of what we
8208                                        are populating */
8209                 if (LIKELY(end < UV_MAX)) {
8210                     return;
8211                 }
8212
8213                 /* We get here when the upper bound is the maximum
8214                  * representable on the machine, and we are looking for just
8215                  * that code point.  Have to special case it */
8216                 i = len;
8217                 goto join_end_of_list;
8218             }
8219         }
8220         assert(current >= start);
8221
8222         /* The current range ends one below the next one, except don't go past
8223          * <end> */
8224         i++;
8225         upper = (i < len && array[i] < end) ? array[i] : end;
8226
8227         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
8228          * for each code point in it */
8229         for (; current < upper; current++) {
8230             const STRLEN offset = (STRLEN)(current - start);
8231             swatch[offset >> 3] |= 1 << (offset & 7);
8232         }
8233
8234     join_end_of_list:
8235
8236         /* Quit if at the end of the list */
8237         if (i >= len) {
8238
8239             /* But first, have to deal with the highest possible code point on
8240              * the platform.  The previous code assumes that <end> is one
8241              * beyond where we want to populate, but that is impossible at the
8242              * platform's infinity, so have to handle it specially */
8243             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8244             {
8245                 const STRLEN offset = (STRLEN)(end - start);
8246                 swatch[offset >> 3] |= 1 << (offset & 7);
8247             }
8248             return;
8249         }
8250
8251         /* Advance to the next range, which will be for code points not in the
8252          * inversion list */
8253         current = array[i];
8254     }
8255
8256     return;
8257 }
8258
8259 void
8260 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8261                                          const bool complement_b, SV** output)
8262 {
8263     /* Take the union of two inversion lists and point <output> to it.  *output
8264      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8265      * the reference count to that list will be decremented if not already a
8266      * temporary (mortal); otherwise *output will be made correspondingly
8267      * mortal.  The first list, <a>, may be NULL, in which case a copy of the
8268      * second list is returned.  If <complement_b> is TRUE, the union is taken
8269      * of the complement (inversion) of <b> instead of b itself.
8270      *
8271      * The basis for this comes from "Unicode Demystified" Chapter 13 by
8272      * Richard Gillam, published by Addison-Wesley, and explained at some
8273      * length there.  The preface says to incorporate its examples into your
8274      * code at your own risk.
8275      *
8276      * The algorithm is like a merge sort.
8277      *
8278      * XXX A potential performance improvement is to keep track as we go along
8279      * if only one of the inputs contributes to the result, meaning the other
8280      * is a subset of that one.  In that case, we can skip the final copy and
8281      * return the larger of the input lists, but then outside code might need
8282      * to keep track of whether to free the input list or not */
8283
8284     const UV* array_a;    /* a's array */
8285     const UV* array_b;
8286     UV len_a;       /* length of a's array */
8287     UV len_b;
8288
8289     SV* u;                      /* the resulting union */
8290     UV* array_u;
8291     UV len_u;
8292
8293     UV i_a = 0;             /* current index into a's array */
8294     UV i_b = 0;
8295     UV i_u = 0;
8296
8297     /* running count, as explained in the algorithm source book; items are
8298      * stopped accumulating and are output when the count changes to/from 0.
8299      * The count is incremented when we start a range that's in the set, and
8300      * decremented when we start a range that's not in the set.  So its range
8301      * is 0 to 2.  Only when the count is zero is something not in the set.
8302      */
8303     UV count = 0;
8304
8305     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8306     assert(a != b);
8307
8308     /* If either one is empty, the union is the other one */
8309     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8310         bool make_temp = FALSE; /* Should we mortalize the result? */
8311
8312         if (*output == a) {
8313             if (a != NULL) {
8314                 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8315                     SvREFCNT_dec_NN(a);
8316                 }
8317             }
8318         }
8319         if (*output != b) {
8320             *output = invlist_clone(b);
8321             if (complement_b) {
8322                 _invlist_invert(*output);
8323             }
8324         } /* else *output already = b; */
8325
8326         if (make_temp) {
8327             sv_2mortal(*output);
8328         }
8329         return;
8330     }
8331     else if ((len_b = _invlist_len(b)) == 0) {
8332         bool make_temp = FALSE;
8333         if (*output == b) {
8334             if (! (make_temp = cBOOL(SvTEMP(b)))) {
8335                 SvREFCNT_dec_NN(b);
8336             }
8337         }
8338
8339         /* The complement of an empty list is a list that has everything in it,
8340          * so the union with <a> includes everything too */
8341         if (complement_b) {
8342             if (a == *output) {
8343                 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8344                     SvREFCNT_dec_NN(a);
8345                 }
8346             }
8347             *output = _new_invlist(1);
8348             _append_range_to_invlist(*output, 0, UV_MAX);
8349         }
8350         else if (*output != a) {
8351             *output = invlist_clone(a);
8352         }
8353         /* else *output already = a; */
8354
8355         if (make_temp) {
8356             sv_2mortal(*output);
8357         }
8358         return;
8359     }
8360
8361     /* Here both lists exist and are non-empty */
8362     array_a = invlist_array(a);
8363     array_b = invlist_array(b);
8364
8365     /* If are to take the union of 'a' with the complement of b, set it
8366      * up so are looking at b's complement. */
8367     if (complement_b) {
8368
8369         /* To complement, we invert: if the first element is 0, remove it.  To
8370          * do this, we just pretend the array starts one later */
8371         if (array_b[0] == 0) {
8372             array_b++;
8373             len_b--;
8374         }
8375         else {
8376
8377             /* But if the first element is not zero, we pretend the list starts
8378              * at the 0 that is always stored immediately before the array. */
8379             array_b--;
8380             len_b++;
8381         }
8382     }
8383
8384     /* Size the union for the worst case: that the sets are completely
8385      * disjoint */
8386     u = _new_invlist(len_a + len_b);
8387
8388     /* Will contain U+0000 if either component does */
8389     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8390                                       || (len_b > 0 && array_b[0] == 0));
8391
8392     /* Go through each list item by item, stopping when exhausted one of
8393      * them */
8394     while (i_a < len_a && i_b < len_b) {
8395         UV cp;      /* The element to potentially add to the union's array */
8396         bool cp_in_set;   /* is it in the the input list's set or not */
8397
8398         /* We need to take one or the other of the two inputs for the union.
8399          * Since we are merging two sorted lists, we take the smaller of the
8400          * next items.  In case of a tie, we take the one that is in its set
8401          * first.  If we took one not in the set first, it would decrement the
8402          * count, possibly to 0 which would cause it to be output as ending the
8403          * range, and the next time through we would take the same number, and
8404          * output it again as beginning the next range.  By doing it the
8405          * opposite way, there is no possibility that the count will be
8406          * momentarily decremented to 0, and thus the two adjoining ranges will
8407          * be seamlessly merged.  (In a tie and both are in the set or both not
8408          * in the set, it doesn't matter which we take first.) */
8409         if (array_a[i_a] < array_b[i_b]
8410             || (array_a[i_a] == array_b[i_b]
8411                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8412         {
8413             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8414             cp= array_a[i_a++];
8415         }
8416         else {
8417             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8418             cp = array_b[i_b++];
8419         }
8420
8421         /* Here, have chosen which of the two inputs to look at.  Only output
8422          * if the running count changes to/from 0, which marks the
8423          * beginning/end of a range that's in the set */
8424         if (cp_in_set) {
8425             if (count == 0) {
8426                 array_u[i_u++] = cp;
8427             }
8428             count++;
8429         }
8430         else {
8431             count--;
8432             if (count == 0) {
8433                 array_u[i_u++] = cp;
8434             }
8435         }
8436     }
8437
8438     /* Here, we are finished going through at least one of the lists, which
8439      * means there is something remaining in at most one.  We check if the list
8440      * that hasn't been exhausted is positioned such that we are in the middle
8441      * of a range in its set or not.  (i_a and i_b point to the element beyond
8442      * the one we care about.) If in the set, we decrement 'count'; if 0, there
8443      * is potentially more to output.
8444      * There are four cases:
8445      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
8446      *     in the union is entirely from the non-exhausted set.
8447      *  2) Both were in their sets, count is 2.  Nothing further should
8448      *     be output, as everything that remains will be in the exhausted
8449      *     list's set, hence in the union; decrementing to 1 but not 0 insures
8450      *     that
8451      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
8452      *     Nothing further should be output because the union includes
8453      *     everything from the exhausted set.  Not decrementing ensures that.
8454      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8455      *     decrementing to 0 insures that we look at the remainder of the
8456      *     non-exhausted set */
8457     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8458         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8459     {
8460         count--;
8461     }
8462
8463     /* The final length is what we've output so far, plus what else is about to
8464      * be output.  (If 'count' is non-zero, then the input list we exhausted
8465      * has everything remaining up to the machine's limit in its set, and hence
8466      * in the union, so there will be no further output. */
8467     len_u = i_u;
8468     if (count == 0) {
8469         /* At most one of the subexpressions will be non-zero */
8470         len_u += (len_a - i_a) + (len_b - i_b);
8471     }
8472
8473     /* Set result to final length, which can change the pointer to array_u, so
8474      * re-find it */
8475     if (len_u != _invlist_len(u)) {
8476         invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8477         invlist_trim(u);
8478         array_u = invlist_array(u);
8479     }
8480
8481     /* When 'count' is 0, the list that was exhausted (if one was shorter than
8482      * the other) ended with everything above it not in its set.  That means
8483      * that the remaining part of the union is precisely the same as the
8484      * non-exhausted list, so can just copy it unchanged.  (If both list were
8485      * exhausted at the same time, then the operations below will be both 0.)
8486      */
8487     if (count == 0) {
8488         IV copy_count; /* At most one will have a non-zero copy count */
8489         if ((copy_count = len_a - i_a) > 0) {
8490             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8491         }
8492         else if ((copy_count = len_b - i_b) > 0) {
8493             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8494         }
8495     }
8496
8497     /*  We may be removing a reference to one of the inputs.  If so, the output
8498      *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8499      *  count decremented) */
8500     if (a == *output || b == *output) {
8501         assert(! invlist_is_iterating(*output));
8502         if ((SvTEMP(*output))) {
8503             sv_2mortal(u);
8504         }
8505         else {
8506             SvREFCNT_dec_NN(*output);
8507         }
8508     }
8509
8510     *output = u;
8511
8512     return;
8513 }
8514
8515 void
8516 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8517                                                const bool complement_b, SV** i)
8518 {
8519     /* Take the intersection of two inversion lists and point <i> to it.  *i
8520      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8521      * the reference count to that list will be decremented if not already a
8522      * temporary (mortal); otherwise *i will be made correspondingly mortal.
8523      * The first list, <a>, may be NULL, in which case an empty list is
8524      * returned.  If <complement_b> is TRUE, the result will be the
8525      * intersection of <a> and the complement (or inversion) of <b> instead of
8526      * <b> directly.
8527      *
8528      * The basis for this comes from "Unicode Demystified" Chapter 13 by
8529      * Richard Gillam, published by Addison-Wesley, and explained at some
8530      * length there.  The preface says to incorporate its examples into your
8531      * code at your own risk.  In fact, it had bugs
8532      *
8533      * The algorithm is like a merge sort, and is essentially the same as the
8534      * union above
8535      */
8536
8537     const UV* array_a;          /* a's array */
8538     const UV* array_b;
8539     UV len_a;   /* length of a's array */
8540     UV len_b;
8541
8542     SV* r;                   /* the resulting intersection */
8543     UV* array_r;
8544     UV len_r;
8545
8546     UV i_a = 0;             /* current index into a's array */
8547     UV i_b = 0;
8548     UV i_r = 0;
8549
8550     /* running count, as explained in the algorithm source book; items are
8551      * stopped accumulating and are output when the count changes to/from 2.
8552      * The count is incremented when we start a range that's in the set, and
8553      * decremented when we start a range that's not in the set.  So its range
8554      * is 0 to 2.  Only when the count is 2 is something in the intersection.
8555      */
8556     UV count = 0;
8557
8558     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8559     assert(a != b);
8560
8561     /* Special case if either one is empty */
8562     len_a = (a == NULL) ? 0 : _invlist_len(a);
8563     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8564         bool make_temp = FALSE;
8565
8566         if (len_a != 0 && complement_b) {
8567
8568             /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8569              * be empty.  Here, also we are using 'b's complement, which hence
8570              * must be every possible code point.  Thus the intersection is
8571              * simply 'a'. */
8572             if (*i != a) {
8573                 if (*i == b) {
8574                     if (! (make_temp = cBOOL(SvTEMP(b)))) {
8575                         SvREFCNT_dec_NN(b);
8576                     }
8577                 }
8578
8579                 *i = invlist_clone(a);
8580             }
8581             /* else *i is already 'a' */
8582
8583             if (make_temp) {
8584                 sv_2mortal(*i);
8585             }
8586             return;
8587         }
8588
8589         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
8590          * intersection must be empty */
8591         if (*i == a) {
8592             if (! (make_temp = cBOOL(SvTEMP(a)))) {
8593                 SvREFCNT_dec_NN(a);
8594             }
8595         }
8596         else if (*i == b) {
8597             if (! (make_temp = cBOOL(SvTEMP(b)))) {
8598                 SvREFCNT_dec_NN(b);
8599             }
8600         }
8601         *i = _new_invlist(0);
8602         if (make_temp) {
8603             sv_2mortal(*i);
8604         }
8605
8606         return;
8607     }
8608
8609     /* Here both lists exist and are non-empty */
8610     array_a = invlist_array(a);
8611     array_b = invlist_array(b);
8612
8613     /* If are to take the intersection of 'a' with the complement of b, set it
8614      * up so are looking at b's complement. */
8615     if (complement_b) {
8616
8617         /* To complement, we invert: if the first element is 0, remove it.  To
8618          * do this, we just pretend the array starts one later */
8619         if (array_b[0] == 0) {
8620             array_b++;
8621             len_b--;
8622         }
8623         else {
8624
8625             /* But if the first element is not zero, we pretend the list starts
8626              * at the 0 that is always stored immediately before the array. */
8627             array_b--;
8628             len_b++;
8629         }
8630     }
8631
8632     /* Size the intersection for the worst case: that the intersection ends up
8633      * fragmenting everything to be completely disjoint */
8634     r= _new_invlist(len_a + len_b);
8635
8636     /* Will contain U+0000 iff both components do */
8637     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8638                                      && len_b > 0 && array_b[0] == 0);
8639
8640     /* Go through each list item by item, stopping when exhausted one of
8641      * them */
8642     while (i_a < len_a && i_b < len_b) {
8643         UV cp;      /* The element to potentially add to the intersection's
8644                        array */
8645         bool cp_in_set; /* Is it in the input list's set or not */
8646
8647         /* We need to take one or the other of the two inputs for the
8648          * intersection.  Since we are merging two sorted lists, we take the
8649          * smaller of the next items.  In case of a tie, we take the one that
8650          * is not in its set first (a difference from the union algorithm).  If
8651          * we took one in the set first, it would increment the count, possibly
8652          * to 2 which would cause it to be output as starting a range in the
8653          * intersection, and the next time through we would take that same
8654          * number, and output it again as ending the set.  By doing it the
8655          * opposite of this, there is no possibility that the count will be
8656          * momentarily incremented to 2.  (In a tie and both are in the set or
8657          * both not in the set, it doesn't matter which we take first.) */
8658         if (array_a[i_a] < array_b[i_b]
8659             || (array_a[i_a] == array_b[i_b]
8660                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8661         {
8662             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8663             cp= array_a[i_a++];
8664         }
8665         else {
8666             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8667             cp= array_b[i_b++];
8668         }
8669
8670         /* Here, have chosen which of the two inputs to look at.  Only output
8671          * if the running count changes to/from 2, which marks the
8672          * beginning/end of a range that's in the intersection */
8673         if (cp_in_set) {
8674             count++;
8675             if (count == 2) {
8676                 array_r[i_r++] = cp;
8677             }
8678         }
8679         else {
8680             if (count == 2) {
8681                 array_r[i_r++] = cp;
8682             }
8683             count--;
8684         }
8685     }
8686
8687     /* Here, we are finished going through at least one of the lists, which
8688      * means there is something remaining in at most one.  We check if the list
8689      * that has been exhausted is positioned such that we are in the middle
8690      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
8691      * the ones we care about.)  There are four cases:
8692      *  1) Both weren't in their sets, count is 0, and remains 0.  There's
8693      *     nothing left in the intersection.
8694      *  2) Both were in their sets, count is 2 and perhaps is incremented to
8695      *     above 2.  What should be output is exactly that which is in the
8696      *     non-exhausted set, as everything it has is also in the intersection
8697      *     set, and everything it doesn't have can't be in the intersection
8698      *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8699      *     gets incremented to 2.  Like the previous case, the intersection is
8700      *     everything that remains in the non-exhausted set.
8701      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8702      *     remains 1.  And the intersection has nothing more. */
8703     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8704         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8705     {
8706         count++;
8707     }
8708
8709     /* The final length is what we've output so far plus what else is in the
8710      * intersection.  At most one of the subexpressions below will be non-zero
8711      * */
8712     len_r = i_r;
8713     if (count >= 2) {
8714         len_r += (len_a - i_a) + (len_b - i_b);
8715     }
8716
8717     /* Set result to final length, which can change the pointer to array_r, so
8718      * re-find it */
8719     if (len_r != _invlist_len(r)) {
8720         invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8721         invlist_trim(r);
8722         array_r = invlist_array(r);
8723     }
8724
8725     /* Finish outputting any remaining */
8726     if (count >= 2) { /* At most one will have a non-zero copy count */
8727         IV copy_count;
8728         if ((copy_count = len_a - i_a) > 0) {
8729             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8730         }
8731         else if ((copy_count = len_b - i_b) > 0) {
8732             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8733         }
8734     }
8735
8736     /*  We may be removing a reference to one of the inputs.  If so, the output
8737      *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8738      *  count decremented) */
8739     if (a == *i || b == *i) {
8740         assert(! invlist_is_iterating(*i));
8741         if (SvTEMP(*i)) {
8742             sv_2mortal(r);
8743         }
8744         else {
8745             SvREFCNT_dec_NN(*i);
8746         }
8747     }
8748
8749     *i = r;
8750
8751     return;
8752 }
8753
8754 SV*
8755 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8756 {
8757     /* Add the range from 'start' to 'end' inclusive to the inversion list's
8758      * set.  A pointer to the inversion list is returned.  This may actually be
8759      * a new list, in which case the passed in one has been destroyed.  The
8760      * passed in inversion list can be NULL, in which case a new one is created
8761      * with just the one range in it */
8762
8763     SV* range_invlist;
8764     UV len;
8765
8766     if (invlist == NULL) {
8767         invlist = _new_invlist(2);
8768         len = 0;
8769     }
8770     else {
8771         len = _invlist_len(invlist);
8772     }
8773
8774     /* If comes after the final entry actually in the list, can just append it
8775      * to the end, */
8776     if (len == 0
8777         || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8778             && start >= invlist_array(invlist)[len - 1]))
8779     {
8780         _append_range_to_invlist(invlist, start, end);
8781         return invlist;
8782     }
8783
8784     /* Here, can't just append things, create and return a new inversion list
8785      * which is the union of this range and the existing inversion list */
8786     range_invlist = _new_invlist(2);
8787     _append_range_to_invlist(range_invlist, start, end);
8788
8789     _invlist_union(invlist, range_invlist, &invlist);
8790
8791     /* The temporary can be freed */
8792     SvREFCNT_dec_NN(range_invlist);
8793
8794     return invlist;
8795 }
8796
8797 SV*
8798 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
8799                                  UV** other_elements_ptr)
8800 {
8801     /* Create and return an inversion list whose contents are to be populated
8802      * by the caller.  The caller gives the number of elements (in 'size') and
8803      * the very first element ('element0').  This function will set
8804      * '*other_elements_ptr' to an array of UVs, where the remaining elements
8805      * are to be placed.
8806      *
8807      * Obviously there is some trust involved that the caller will properly
8808      * fill in the other elements of the array.
8809      *
8810      * (The first element needs to be passed in, as the underlying code does
8811      * things differently depending on whether it is zero or non-zero) */
8812
8813     SV* invlist = _new_invlist(size);
8814     bool offset;
8815
8816     PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
8817
8818     _append_range_to_invlist(invlist, element0, element0);
8819     offset = *get_invlist_offset_addr(invlist);
8820
8821     invlist_set_len(invlist, size, offset);
8822     *other_elements_ptr = invlist_array(invlist) + 1;
8823     return invlist;
8824 }
8825
8826 #endif
8827
8828 PERL_STATIC_INLINE SV*
8829 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8830     return _add_range_to_invlist(invlist, cp, cp);
8831 }
8832
8833 #ifndef PERL_IN_XSUB_RE
8834 void
8835 Perl__invlist_invert(pTHX_ SV* const invlist)
8836 {
8837     /* Complement the input inversion list.  This adds a 0 if the list didn't
8838      * have a zero; removes it otherwise.  As described above, the data
8839      * structure is set up so that this is very efficient */
8840
8841     PERL_ARGS_ASSERT__INVLIST_INVERT;
8842
8843     assert(! invlist_is_iterating(invlist));
8844
8845     /* The inverse of matching nothing is matching everything */
8846     if (_invlist_len(invlist) == 0) {
8847         _append_range_to_invlist(invlist, 0, UV_MAX);
8848         return;
8849     }
8850
8851     *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
8852 }
8853
8854 #endif
8855
8856 PERL_STATIC_INLINE SV*
8857 S_invlist_clone(pTHX_ SV* const invlist)
8858 {
8859
8860     /* Return a new inversion list that is a copy of the input one, which is
8861      * unchanged.  The new list will not be mortal even if the old one was. */
8862
8863     /* Need to allocate extra space to accommodate Perl's addition of a
8864      * trailing NUL to SvPV's, since it thinks they are always strings */
8865     SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8866     STRLEN physical_length = SvCUR(invlist);
8867     bool offset = *(get_invlist_offset_addr(invlist));
8868
8869     PERL_ARGS_ASSERT_INVLIST_CLONE;
8870
8871     *(get_invlist_offset_addr(new_invlist)) = offset;
8872     invlist_set_len(new_invlist, _invlist_len(invlist), offset);
8873     Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
8874
8875     return new_invlist;
8876 }
8877
8878 PERL_STATIC_INLINE STRLEN*
8879 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8880 {
8881     /* Return the address of the UV that contains the current iteration
8882      * position */
8883
8884     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8885
8886     assert(SvTYPE(invlist) == SVt_INVLIST);
8887
8888     return &(((XINVLIST*) SvANY(invlist))->iterator);
8889 }
8890
8891 PERL_STATIC_INLINE void
8892 S_invlist_iterinit(pTHX_ SV* invlist)   /* Initialize iterator for invlist */
8893 {
8894     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8895
8896     *get_invlist_iter_addr(invlist) = 0;
8897 }
8898
8899 PERL_STATIC_INLINE void
8900 S_invlist_iterfinish(pTHX_ SV* invlist)
8901 {
8902     /* Terminate iterator for invlist.  This is to catch development errors.
8903      * Any iteration that is interrupted before completed should call this
8904      * function.  Functions that add code points anywhere else but to the end
8905      * of an inversion list assert that they are not in the middle of an
8906      * iteration.  If they were, the addition would make the iteration
8907      * problematical: if the iteration hadn't reached the place where things
8908      * were being added, it would be ok */
8909
8910     PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8911
8912     *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
8913 }
8914
8915 STATIC bool
8916 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8917 {
8918     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8919      * This call sets in <*start> and <*end>, the next range in <invlist>.
8920      * Returns <TRUE> if successful and the next call will return the next
8921      * range; <FALSE> if was already at the end of the list.  If the latter,
8922      * <*start> and <*end> are unchanged, and the next call to this function
8923      * will start over at the beginning of the list */
8924
8925     STRLEN* pos = get_invlist_iter_addr(invlist);
8926     UV len = _invlist_len(invlist);
8927     UV *array;
8928
8929     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8930
8931     if (*pos >= len) {
8932         *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
8933         return FALSE;
8934     }
8935
8936     array = invlist_array(invlist);
8937
8938     *start = array[(*pos)++];
8939
8940     if (*pos >= len) {
8941         *end = UV_MAX;
8942     }
8943     else {
8944         *end = array[(*pos)++] - 1;
8945     }
8946
8947     return TRUE;
8948 }
8949
8950 PERL_STATIC_INLINE bool
8951 S_invlist_is_iterating(pTHX_ SV* const invlist)
8952 {
8953     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8954
8955     return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8956 }
8957
8958 PERL_STATIC_INLINE UV
8959 S_invlist_highest(pTHX_ SV* const invlist)
8960 {
8961     /* Returns the highest code point that matches an inversion list.  This API
8962      * has an ambiguity, as it returns 0 under either the highest is actually
8963      * 0, or if the list is empty.  If this distinction matters to you, check
8964      * for emptiness before calling this function */
8965
8966     UV len = _invlist_len(invlist);
8967     UV *array;
8968
8969     PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8970
8971     if (len == 0) {
8972         return 0;
8973     }
8974
8975     array = invlist_array(invlist);
8976
8977     /* The last element in the array in the inversion list always starts a
8978      * range that goes to infinity.  That range may be for code points that are
8979      * matched in the inversion list, or it may be for ones that aren't
8980      * matched.  In the latter case, the highest code point in the set is one
8981      * less than the beginning of this range; otherwise it is the final element
8982      * of this range: infinity */
8983     return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8984            ? UV_MAX
8985            : array[len - 1] - 1;
8986 }
8987
8988 #ifndef PERL_IN_XSUB_RE
8989 SV *
8990 Perl__invlist_contents(pTHX_ SV* const invlist)
8991 {
8992     /* Get the contents of an inversion list into a string SV so that they can
8993      * be printed out.  It uses the format traditionally done for debug tracing
8994      */
8995
8996     UV start, end;
8997     SV* output = newSVpvs("\n");
8998
8999     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
9000
9001     assert(! invlist_is_iterating(invlist));
9002
9003     invlist_iterinit(invlist);
9004     while (invlist_iternext(invlist, &start, &end)) {
9005         if (end == UV_MAX) {
9006             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
9007         }
9008         else if (end != start) {
9009             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
9010                     start,       end);
9011         }
9012         else {
9013             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
9014         }
9015     }
9016
9017     return output;
9018 }
9019 #endif
9020
9021 #ifndef PERL_IN_XSUB_RE
9022 void
9023 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
9024                          const char * const indent, SV* const invlist)
9025 {
9026     /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
9027      * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
9028      * the string 'indent'.  The output looks like this:
9029          [0] 0x000A .. 0x000D
9030          [2] 0x0085
9031          [4] 0x2028 .. 0x2029
9032          [6] 0x3104 .. INFINITY
9033      * This means that the first range of code points matched by the list are
9034      * 0xA through 0xD; the second range contains only the single code point
9035      * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
9036      * are used to define each range (except if the final range extends to
9037      * infinity, only a single element is needed).  The array index of the
9038      * first element for the corresponding range is given in brackets. */
9039
9040     UV start, end;
9041     STRLEN count = 0;
9042
9043     PERL_ARGS_ASSERT__INVLIST_DUMP;
9044
9045     if (invlist_is_iterating(invlist)) {
9046         Perl_dump_indent(aTHX_ level, file,
9047              "%sCan't dump inversion list because is in middle of iterating\n",
9048              indent);
9049         return;
9050     }
9051
9052     invlist_iterinit(invlist);
9053     while (invlist_iternext(invlist, &start, &end)) {
9054         if (end == UV_MAX) {
9055             Perl_dump_indent(aTHX_ level, file,
9056                                        "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9057                                    indent, (UV)count, start);
9058         }
9059         else if (end != start) {
9060             Perl_dump_indent(aTHX_ level, file,
9061                                     "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9062                                 indent, (UV)count, start,         end);
9063         }
9064         else {
9065             Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9066                                             indent, (UV)count, start);
9067         }
9068         count += 2;
9069     }
9070 }
9071 #endif
9072
9073 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
9074 bool
9075 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9076 {
9077     /* Return a boolean as to if the two passed in inversion lists are
9078      * identical.  The final argument, if TRUE, says to take the complement of
9079      * the second inversion list before doing the comparison */
9080
9081     const UV* array_a = invlist_array(a);
9082     const UV* array_b = invlist_array(b);
9083     UV len_a = _invlist_len(a);
9084     UV len_b = _invlist_len(b);
9085
9086     UV i = 0;               /* current index into the arrays */
9087     bool retval = TRUE;     /* Assume are identical until proven otherwise */
9088
9089     PERL_ARGS_ASSERT__INVLISTEQ;
9090
9091     /* If are to compare 'a' with the complement of b, set it
9092      * up so are looking at b's complement. */
9093     if (complement_b) {
9094
9095         /* The complement of nothing is everything, so <a> would have to have
9096          * just one element, starting at zero (ending at infinity) */
9097         if (len_b == 0) {
9098             return (len_a == 1 && array_a[0] == 0);
9099         }
9100         else if (array_b[0] == 0) {
9101
9102             /* Otherwise, to complement, we invert.  Here, the first element is
9103              * 0, just remove it.  To do this, we just pretend the array starts
9104              * one later */
9105
9106             array_b++;
9107             len_b--;
9108         }
9109         else {
9110
9111             /* But if the first element is not zero, we pretend the list starts
9112              * at the 0 that is always stored immediately before the array. */
9113             array_b--;
9114             len_b++;
9115         }
9116     }
9117
9118     /* Make sure that the lengths are the same, as well as the final element
9119      * before looping through the remainder.  (Thus we test the length, final,
9120      * and first elements right off the bat) */
9121     if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9122         retval = FALSE;
9123     }
9124     else for (i = 0; i < len_a - 1; i++) {
9125         if (array_a[i] != array_b[i]) {
9126             retval = FALSE;
9127             break;
9128         }
9129     }
9130
9131     return retval;
9132 }
9133 #endif
9134
9135 #undef HEADER_LENGTH
9136 #undef TO_INTERNAL_SIZE
9137 #undef FROM_INTERNAL_SIZE
9138 #undef INVLIST_VERSION_ID
9139
9140 /* End of inversion list object */
9141
9142 STATIC void
9143 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
9144 {
9145     /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
9146      * constructs, and updates RExC_flags with them.  On input, RExC_parse
9147      * should point to the first flag; it is updated on output to point to the
9148      * final ')' or ':'.  There needs to be at least one flag, or this will
9149      * abort */
9150
9151     /* for (?g), (?gc), and (?o) warnings; warning
9152        about (?c) will warn about (?g) -- japhy    */
9153
9154 #define WASTED_O  0x01
9155 #define WASTED_G  0x02
9156 #define WASTED_C  0x04
9157 #define WASTED_GC (WASTED_G|WASTED_C)
9158     I32 wastedflags = 0x00;
9159     U32 posflags = 0, negflags = 0;
9160     U32 *flagsp = &posflags;
9161     char has_charset_modifier = '\0';
9162     regex_charset cs;
9163     bool has_use_defaults = FALSE;
9164     const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
9165
9166     PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
9167
9168     /* '^' as an initial flag sets certain defaults */
9169     if (UCHARAT(RExC_parse) == '^') {
9170         RExC_parse++;
9171         has_use_defaults = TRUE;
9172         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9173         set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
9174                                         ? REGEX_UNICODE_CHARSET
9175                                         : REGEX_DEPENDS_CHARSET);
9176     }
9177
9178     cs = get_regex_charset(RExC_flags);
9179     if (cs == REGEX_DEPENDS_CHARSET
9180         && (RExC_utf8 || RExC_uni_semantics))
9181     {
9182         cs = REGEX_UNICODE_CHARSET;
9183     }
9184
9185     while (*RExC_parse) {
9186         /* && strchr("iogcmsx", *RExC_parse) */
9187         /* (?g), (?gc) and (?o) are useless here
9188            and must be globally applied -- japhy */
9189         switch (*RExC_parse) {
9190
9191             /* Code for the imsx flags */
9192             CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
9193
9194             case LOCALE_PAT_MOD:
9195                 if (has_charset_modifier) {
9196                     goto excess_modifier;
9197                 }
9198                 else if (flagsp == &negflags) {
9199                     goto neg_modifier;
9200                 }
9201                 cs = REGEX_LOCALE_CHARSET;
9202                 has_charset_modifier = LOCALE_PAT_MOD;
9203                 break;
9204             case UNICODE_PAT_MOD:
9205                 if (has_charset_modifier) {
9206                     goto excess_modifier;
9207                 }
9208                 else if (flagsp == &negflags) {
9209                     goto neg_modifier;
9210                 }
9211                 cs = REGEX_UNICODE_CHARSET;
9212                 has_charset_modifier = UNICODE_PAT_MOD;
9213                 break;
9214             case ASCII_RESTRICT_PAT_MOD:
9215                 if (flagsp == &negflags) {
9216                     goto neg_modifier;
9217                 }
9218                 if (has_charset_modifier) {
9219                     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9220                         goto excess_modifier;
9221                     }
9222                     /* Doubled modifier implies more restricted */
9223                     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9224                 }
9225                 else {
9226                     cs = REGEX_ASCII_RESTRICTED_CHARSET;
9227                 }
9228                 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9229                 break;
9230             case DEPENDS_PAT_MOD:
9231                 if (has_use_defaults) {
9232                     goto fail_modifiers;
9233                 }
9234                 else if (flagsp == &negflags) {
9235                     goto neg_modifier;
9236                 }
9237                 else if (has_charset_modifier) {
9238                     goto excess_modifier;
9239                 }
9240
9241                 /* The dual charset means unicode semantics if the
9242                  * pattern (or target, not known until runtime) are
9243                  * utf8, or something in the pattern indicates unicode
9244                  * semantics */
9245                 cs = (RExC_utf8 || RExC_uni_semantics)
9246                      ? REGEX_UNICODE_CHARSET
9247                      : REGEX_DEPENDS_CHARSET;
9248                 has_charset_modifier = DEPENDS_PAT_MOD;
9249                 break;
9250             excess_modifier:
9251                 RExC_parse++;
9252                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9253                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9254                 }
9255                 else if (has_charset_modifier == *(RExC_parse - 1)) {
9256                     vFAIL2("Regexp modifier \"%c\" may not appear twice",
9257                                         *(RExC_parse - 1));
9258                 }
9259                 else {
9260                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9261                 }
9262                 /*NOTREACHED*/
9263             neg_modifier:
9264                 RExC_parse++;
9265                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
9266                                     *(RExC_parse - 1));
9267                 /*NOTREACHED*/
9268             case ONCE_PAT_MOD: /* 'o' */
9269             case GLOBAL_PAT_MOD: /* 'g' */
9270                 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9271                     const I32 wflagbit = *RExC_parse == 'o'
9272                                          ? WASTED_O
9273                                          : WASTED_G;
9274                     if (! (wastedflags & wflagbit) ) {
9275                         wastedflags |= wflagbit;
9276                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9277                         vWARN5(
9278                             RExC_parse + 1,
9279                             "Useless (%s%c) - %suse /%c modifier",
9280                             flagsp == &negflags ? "?-" : "?",
9281                             *RExC_parse,
9282                             flagsp == &negflags ? "don't " : "",
9283                             *RExC_parse
9284                         );
9285                     }
9286                 }
9287                 break;
9288
9289             case CONTINUE_PAT_MOD: /* 'c' */
9290                 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9291                     if (! (wastedflags & WASTED_C) ) {
9292                         wastedflags |= WASTED_GC;
9293                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9294                         vWARN3(
9295                             RExC_parse + 1,
9296                             "Useless (%sc) - %suse /gc modifier",
9297                             flagsp == &negflags ? "?-" : "?",
9298                             flagsp == &negflags ? "don't " : ""
9299                         );
9300                     }
9301                 }
9302                 break;
9303             case KEEPCOPY_PAT_MOD: /* 'p' */
9304                 if (flagsp == &negflags) {
9305                     if (SIZE_ONLY)
9306                         ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9307                 } else {
9308                     *flagsp |= RXf_PMf_KEEPCOPY;
9309                 }
9310                 break;
9311             case '-':
9312                 /* A flag is a default iff it is following a minus, so
9313                  * if there is a minus, it means we would be trying to
9314                  * re-specify a default, which is an error */
9315                 if (has_use_defaults || flagsp == &negflags) {
9316                     goto fail_modifiers;
9317                 }
9318                 flagsp = &negflags;
9319                 wastedflags = 0;  /* reset so (?g-c) warns twice */
9320                 break;
9321             case ':':
9322             case ')':
9323                 RExC_flags |= posflags;
9324                 RExC_flags &= ~negflags;
9325                 set_regex_charset(&RExC_flags, cs);
9326                 if (RExC_flags & RXf_PMf_FOLD) {
9327                     RExC_contains_i = 1;
9328                 }
9329                 return;
9330                 /*NOTREACHED*/
9331             default:
9332             fail_modifiers:
9333                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9334                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9335                 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9336                       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9337                 /*NOTREACHED*/
9338         }
9339
9340         ++RExC_parse;
9341     }
9342 }
9343
9344 /*
9345  - reg - regular expression, i.e. main body or parenthesized thing
9346  *
9347  * Caller must absorb opening parenthesis.
9348  *
9349  * Combining parenthesis handling with the base level of regular expression
9350  * is a trifle forced, but the need to tie the tails of the branches to what
9351  * follows makes it hard to avoid.
9352  */
9353 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1) /* 'depth' is taken from the expansion site */
9354 #ifdef DEBUGGING /* debugging builds route REGTAIL_STUDY to regtail_study() */
9355 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9356 #else /* !DEBUGGING: REGTAIL_STUDY expands exactly like REGTAIL */
9357 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9358 #endif /* DEBUGGING */
9359
9360 /* Returns NULL, setting *flagp to TRYAGAIN, at the end of a (?...) construct
9361    that only sets flags, or after a (?#...) comment.  Returns NULL, setting
9362    *flagp to RESTART_UTF8, if the sizing scan needs to be restarted.
9363    Otherwise it would only return NULL if regbranch() returned NULL, which
9364    cannot happen.  */
9365 STATIC regnode *
9366 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9367     /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9368      * 2 is like 1, but indicates that nextchar() has been called to advance
9369      * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
9370      * this flag alerts us to the need to check for that */
9371 {
9372     dVAR;
9373     regnode *ret;               /* Will be the head of the group. */
9374     regnode *br;
9375     regnode *lastbr;
9376     regnode *ender = NULL;
9377     I32 parno = 0;
9378     I32 flags;
9379     U32 oregflags = RExC_flags;
9380     bool have_branch = 0;
9381     bool is_open = 0;
9382     I32 freeze_paren = 0;
9383     I32 after_freeze = 0;
9384
9385     char * parse_start = RExC_parse; /* MJD */
9386     char * const oregcomp_parse = RExC_parse;
9387
9388     GET_RE_DEBUG_FLAGS_DECL;
9389
9390     PERL_ARGS_ASSERT_REG;
9391     DEBUG_PARSE("reg ");
9392
9393     *flagp = 0;                         /* Tentatively. */
9394
9395
9396     /* Make an OPEN node, if parenthesized. */
9397     if (paren) {
9398
9399         /* Under /x, space and comments can be gobbled up between the '(' and
9400          * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
9401          * intervening space, as the sequence is a token, and a token should be
9402          * indivisible */
9403         bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9404
9405         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9406             char *start_verb = RExC_parse;
9407             STRLEN verb_len = 0;
9408             char *start_arg = NULL;
9409             unsigned char op = 0;
9410             int argok = 1;
9411             int internal_argval = 0; /* internal_argval is only useful if
9412                                         !argok */
9413
9414             if (has_intervening_patws && SIZE_ONLY) {
9415                 ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated");
9416             }
9417             while ( *RExC_parse && *RExC_parse != ')' ) {
9418                 if ( *RExC_parse == ':' ) {
9419                     start_arg = RExC_parse + 1;
9420                     break;
9421                 }
9422                 RExC_parse++;
9423             }
9424             ++start_verb;
9425             verb_len = RExC_parse - start_verb;
9426             if ( start_arg ) {
9427                 RExC_parse++;
9428                 while ( *RExC_parse && *RExC_parse != ')' )
9429                     RExC_parse++;
9430                 if ( *RExC_parse != ')' )
9431                     vFAIL("Unterminated verb pattern argument");
9432                 if ( RExC_parse == start_arg )
9433                     start_arg = NULL;
9434             } else {
9435                 if ( *RExC_parse != ')' )
9436                     vFAIL("Unterminated verb pattern");
9437             }
9438
9439             switch ( *start_verb ) {
9440             case 'A':  /* (*ACCEPT) */
9441                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9442                     op = ACCEPT;
9443                     internal_argval = RExC_nestroot;
9444                 }
9445                 break;
9446             case 'C':  /* (*COMMIT) */
9447                 if ( memEQs(start_verb,verb_len,"COMMIT") )
9448                     op = COMMIT;
9449                 break;
9450             case 'F':  /* (*FAIL) */
9451                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9452                     op = OPFAIL;
9453                     argok = 0;
9454                 }
9455                 break;
9456             case ':':  /* (*:NAME) */
9457             case 'M':  /* (*MARK:NAME) */
9458                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9459                     op = MARKPOINT;
9460                     argok = -1;
9461                 }
9462                 break;
9463             case 'P':  /* (*PRUNE) */
9464                 if ( memEQs(start_verb,verb_len,"PRUNE") )
9465                     op = PRUNE;
9466                 break;
9467             case 'S':   /* (*SKIP) */
9468                 if ( memEQs(start_verb,verb_len,"SKIP") )
9469                     op = SKIP;
9470                 break;
9471             case 'T':  /* (*THEN) */
9472                 /* [19:06] <TimToady> :: is then */
9473                 if ( memEQs(start_verb,verb_len,"THEN") ) {
9474                     op = CUTGROUP;
9475                     RExC_seen |= REG_CUTGROUP_SEEN;
9476                 }
9477                 break;
9478             }
9479             if ( ! op ) {
9480                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9481                 vFAIL2utf8f(
9482                     "Unknown verb pattern '%"UTF8f"'",
9483                     UTF8fARG(UTF, verb_len, start_verb));
9484             }
9485             if ( argok ) {
9486                 if ( start_arg && internal_argval ) {
9487                     vFAIL3("Verb pattern '%.*s' may not have an argument",
9488                         verb_len, start_verb);
9489                 } else if ( argok < 0 && !start_arg ) {
9490                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9491                         verb_len, start_verb);
9492                 } else {
9493                     ret = reganode(pRExC_state, op, internal_argval);
9494                     if ( ! internal_argval && ! SIZE_ONLY ) {
9495                         if (start_arg) {
9496                             SV *sv = newSVpvn( start_arg,
9497                                                RExC_parse - start_arg);
9498                             ARG(ret) = add_data( pRExC_state,
9499                                                  STR_WITH_LEN("S"));
9500                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9501                             ret->flags = 0;
9502                         } else {
9503                             ret->flags = 1;
9504                         }
9505                     }
9506                 }
9507                 if (!internal_argval)
9508                     RExC_seen |= REG_VERBARG_SEEN;
9509             } else if ( start_arg ) {
9510                 vFAIL3("Verb pattern '%.*s' may not have an argument",
9511                         verb_len, start_verb);
9512             } else {
9513                 ret = reg_node(pRExC_state, op);
9514             }
9515             nextchar(pRExC_state);
9516             return ret;
9517         }
9518         else if (*RExC_parse == '?') { /* (?...) */
9519             bool is_logical = 0;
9520             const char * const seqstart = RExC_parse;
9521             if (has_intervening_patws && SIZE_ONLY) {
9522                 ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated");
9523             }
9524
9525             RExC_parse++;
9526             paren = *RExC_parse++;
9527             ret = NULL;                 /* For look-ahead/behind. */
9528             switch (paren) {
9529
9530             case 'P':   /* (?P...) variants for those used to PCRE/Python */
9531                 paren = *RExC_parse++;
9532                 if ( paren == '<')         /* (?P<...>) named capture */
9533                     goto named_capture;
9534                 else if (paren == '>') {   /* (?P>name) named recursion */
9535                     goto named_recursion;
9536                 }
9537                 else if (paren == '=') {   /* (?P=...)  named backref */
9538                     /* this pretty much dupes the code for \k<NAME> in
9539                      * regatom(), if you change this make sure you change that
9540                      * */
9541                     char* name_start = RExC_parse;
9542                     U32 num = 0;
9543                     SV *sv_dat = reg_scan_name(pRExC_state,
9544                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9545                     if (RExC_parse == name_start || *RExC_parse != ')')
9546                         /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9547                         vFAIL2("Sequence %.3s... not terminated",parse_start);
9548
9549                     if (!SIZE_ONLY) {
9550                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
9551                         RExC_rxi->data->data[num]=(void*)sv_dat;
9552                         SvREFCNT_inc_simple_void(sv_dat);
9553                     }
9554                     RExC_sawback = 1;
9555                     ret = reganode(pRExC_state,
9556                                    ((! FOLD)
9557                                      ? NREF
9558                                      : (ASCII_FOLD_RESTRICTED)
9559                                        ? NREFFA
9560                                        : (AT_LEAST_UNI_SEMANTICS)
9561                                          ? NREFFU
9562                                          : (LOC)
9563                                            ? NREFFL
9564                                            : NREFF),
9565                                     num);
9566                     *flagp |= HASWIDTH;
9567
9568                     Set_Node_Offset(ret, parse_start+1);
9569                     Set_Node_Cur_Length(ret, parse_start);
9570
9571                     nextchar(pRExC_state);
9572                     return ret;
9573                 }
9574                 RExC_parse++;
9575                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9576                 vFAIL3("Sequence (%.*s...) not recognized",
9577                                 RExC_parse-seqstart, seqstart);
9578                 /*NOTREACHED*/
9579             case '<':           /* (?<...) */
9580                 if (*RExC_parse == '!')
9581                     paren = ',';
9582                 else if (*RExC_parse != '=')
9583               named_capture:
9584                 {               /* (?<...>) */
9585                     char *name_start;
9586                     SV *svname;
9587                     paren= '>';
9588             case '\'':          /* (?'...') */
9589                     name_start= RExC_parse;
9590                     svname = reg_scan_name(pRExC_state,
9591                         SIZE_ONLY    /* reverse test from the others */
9592                         ? REG_RSN_RETURN_NAME
9593                         : REG_RSN_RETURN_NULL);
9594                     if (RExC_parse == name_start || *RExC_parse != paren)
9595                         vFAIL2("Sequence (?%c... not terminated",
9596                             paren=='>' ? '<' : paren);
9597                     if (SIZE_ONLY) {
9598                         HE *he_str;
9599                         SV *sv_dat = NULL;
9600                         if (!svname) /* shouldn't happen */
9601                             Perl_croak(aTHX_
9602                                 "panic: reg_scan_name returned NULL");
9603                         if (!RExC_paren_names) {
9604                             RExC_paren_names= newHV();
9605                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
9606 #ifdef DEBUGGING
9607                             RExC_paren_name_list= newAV();
9608                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
9609 #endif
9610                         }
9611                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
9612                         if ( he_str )
9613                             sv_dat = HeVAL(he_str);
9614                         if ( ! sv_dat ) {
9615                             /* croak baby croak */
9616                             Perl_croak(aTHX_
9617                                 "panic: paren_name hash element allocation failed");
9618                         } else if ( SvPOK(sv_dat) ) {
9619                             /* (?|...) can mean we have dupes, so scan to check
9620                                whether it has already been stored. Maybe a flag
9621                                indicating we are inside such a construct would be
9622                                useful, but the arrays are likely to be quite
9623                                small, so for now we punt -- dmq */
9624                             IV count = SvIV(sv_dat);
9625                             I32 *pv = (I32*)SvPVX(sv_dat);
9626                             IV i;
9627                             for ( i = 0 ; i < count ; i++ ) {
9628                                 if ( pv[i] == RExC_npar ) {
9629                                     count = 0;
9630                                     break;
9631                                 }
9632                             }
9633                             if ( count ) {
9634                                 pv = (I32*)SvGROW(sv_dat,
9635                                                 SvCUR(sv_dat) + sizeof(I32)+1);
9636                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
9637                                 pv[count] = RExC_npar;
9638                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
9639                             }
9640                         } else {
9641                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
9642                             sv_setpvn(sv_dat, (char *)&(RExC_npar),
9643                                                                 sizeof(I32));
9644                             SvIOK_on(sv_dat);
9645                             SvIV_set(sv_dat, 1);
9646                         }
9647 #ifdef DEBUGGING
9648                         /* Yes this does cause a memory leak in debugging Perls
9649                          * */
9650                         if (!av_store(RExC_paren_name_list,
9651                                       RExC_npar, SvREFCNT_inc(svname)))
9652                             SvREFCNT_dec_NN(svname);
9653 #endif
9654
9655                         /*sv_dump(sv_dat);*/
9656                     }
9657                     nextchar(pRExC_state);
9658                     paren = 1;
9659                     goto capturing_parens;
9660                 }
9661                 RExC_seen |= REG_LOOKBEHIND_SEEN;
9662                 RExC_in_lookbehind++;
9663                 RExC_parse++;
9664                 /* FALLTHROUGH */
9665             case '=':           /* (?=...) */
9666                 RExC_seen_zerolen++;
9667                 break;
9668             case '!':           /* (?!...) */
9669                 RExC_seen_zerolen++;
9670                 if (*RExC_parse == ')') {
9671                     ret=reg_node(pRExC_state, OPFAIL);
9672                     nextchar(pRExC_state);
9673                     return ret;
9674                 }
9675                 break;
9676             case '|':           /* (?|...) */
9677                 /* branch reset, behave like a (?:...) except that
9678                    buffers in alternations share the same numbers */
9679                 paren = ':';
9680                 after_freeze = freeze_paren = RExC_npar;
9681                 break;
9682             case ':':           /* (?:...) */
9683             case '>':           /* (?>...) */
9684                 break;
9685             case '$':           /* (?$...) */
9686             case '@':           /* (?@...) */
9687                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
9688                 break;
9689             case '#':           /* (?#...) */
9690                 /* XXX As soon as we disallow separating the '?' and '*' (by
9691                  * spaces or (?#...) comment), it is believed that this case
9692                  * will be unreachable and can be removed.  See
9693                  * [perl #117327] */
9694                 while (*RExC_parse && *RExC_parse != ')')
9695                     RExC_parse++;
9696                 if (*RExC_parse != ')')
9697                     FAIL("Sequence (?#... not terminated");
9698                 nextchar(pRExC_state);
9699                 *flagp = TRYAGAIN;
9700                 return NULL;
9701             case '0' :           /* (?0) */
9702             case 'R' :           /* (?R) */
9703                 if (*RExC_parse != ')')
9704                     FAIL("Sequence (?R) not terminated");
9705                 ret = reg_node(pRExC_state, GOSTART);
9706                     RExC_seen |= REG_GOSTART_SEEN;
9707                 *flagp |= POSTPONED;
9708                 nextchar(pRExC_state);
9709                 return ret;
9710                 /*notreached*/
9711             { /* named and numeric backreferences */
9712                 I32 num;
9713             case '&':            /* (?&NAME) */
9714                 parse_start = RExC_parse - 1;
9715               named_recursion:
9716                 {
9717                     SV *sv_dat = reg_scan_name(pRExC_state,
9718                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9719                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9720                 }
9721                 if (RExC_parse == RExC_end || *RExC_parse != ')')
9722                     vFAIL("Sequence (?&... not terminated");
9723                 goto gen_recurse_regop;
9724                 assert(0); /* NOT REACHED */
9725             case '+':
9726                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9727                     RExC_parse++;
9728                     vFAIL("Illegal pattern");
9729                 }
9730                 goto parse_recursion;
9731                 /* NOT REACHED*/
9732             case '-': /* (?-1) */
9733                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9734                     RExC_parse--; /* rewind to let it be handled later */
9735                     goto parse_flags;
9736                 }
9737                 /*FALLTHROUGH */
9738             case '1': case '2': case '3': case '4': /* (?1) */
9739             case '5': case '6': case '7': case '8': case '9':
9740                 RExC_parse--;
9741               parse_recursion:
9742                 num = atoi(RExC_parse);
9743                 parse_start = RExC_parse - 1; /* MJD */
9744                 if (*RExC_parse == '-')
9745                     RExC_parse++;
9746                 while (isDIGIT(*RExC_parse))
9747                         RExC_parse++;
9748                 if (*RExC_parse!=')')
9749                     vFAIL("Expecting close bracket");
9750
9751               gen_recurse_regop:
9752                 if ( paren == '-' ) {
9753                     /*
9754                     Diagram of capture buffer numbering.
9755                     Top line is the normal capture buffer numbers
9756                     Bottom line is the negative indexing as from
9757                     the X (the (?-2))
9758
9759                     +   1 2    3 4 5 X          6 7
9760                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
9761                     -   5 4    3 2 1 X          x x
9762
9763                     */
9764                     num = RExC_npar + num;
9765                     if (num < 1)  {
9766                         RExC_parse++;
9767                         vFAIL("Reference to nonexistent group");
9768                     }
9769                 } else if ( paren == '+' ) {
9770                     num = RExC_npar + num - 1;
9771                 }
9772
9773                 ret = reganode(pRExC_state, GOSUB, num);
9774                 if (!SIZE_ONLY) {
9775                     if (num > (I32)RExC_rx->nparens) {
9776                         RExC_parse++;
9777                         vFAIL("Reference to nonexistent group");
9778                     }
9779                     ARG2L_SET( ret, RExC_recurse_count++);
9780                     RExC_emit++;
9781                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9782                         "Recurse #%"UVuf" to %"IVdf"\n",
9783                               (UV)ARG(ret), (IV)ARG2L(ret)));
9784                 } else {
9785                     RExC_size++;
9786                 }
9787                     RExC_seen |= REG_RECURSE_SEEN;
9788                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
9789                 Set_Node_Offset(ret, parse_start); /* MJD */
9790
9791                 *flagp |= POSTPONED;
9792                 nextchar(pRExC_state);
9793                 return ret;
9794             } /* named and numeric backreferences */
9795             assert(0); /* NOT REACHED */
9796
9797             case '?':           /* (??...) */
9798                 is_logical = 1;
9799                 if (*RExC_parse != '{') {
9800                     RExC_parse++;
9801                     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9802                     vFAIL2utf8f(
9803                         "Sequence (%"UTF8f"...) not recognized",
9804                         UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9805                     /*NOTREACHED*/
9806                 }
9807                 *flagp |= POSTPONED;
9808                 paren = *RExC_parse++;
9809                 /* FALL THROUGH */
9810             case '{':           /* (?{...}) */
9811             {
9812                 U32 n = 0;
9813                 struct reg_code_block *cb;
9814
9815                 RExC_seen_zerolen++;
9816
9817                 if (   !pRExC_state->num_code_blocks
9818                     || pRExC_state->code_index >= pRExC_state->num_code_blocks
9819                     || pRExC_state->code_blocks[pRExC_state->code_index].start
9820                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
9821                             - RExC_start)
9822                 ) {
9823                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
9824                         FAIL("panic: Sequence (?{...}): no code block found\n");
9825                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
9826                 }
9827                 /* this is a pre-compiled code block (?{...}) */
9828                 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
9829                 RExC_parse = RExC_start + cb->end;
9830                 if (!SIZE_ONLY) {
9831                     OP *o = cb->block;
9832                     if (cb->src_regex) {
9833                         n = add_data(pRExC_state, STR_WITH_LEN("rl"));
9834                         RExC_rxi->data->data[n] =
9835                             (void*)SvREFCNT_inc((SV*)cb->src_regex);
9836                         RExC_rxi->data->data[n+1] = (void*)o;
9837                     }
9838                     else {
9839                         n = add_data(pRExC_state,
9840                                (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
9841                         RExC_rxi->data->data[n] = (void*)o;
9842                     }
9843                 }
9844                 pRExC_state->code_index++;
9845                 nextchar(pRExC_state);
9846
9847                 if (is_logical) {
9848                     regnode *eval;
9849                     ret = reg_node(pRExC_state, LOGICAL);
9850                     eval = reganode(pRExC_state, EVAL, n);
9851                     if (!SIZE_ONLY) {
9852                         ret->flags = 2;
9853                         /* for later propagation into (??{}) return value */
9854                         eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
9855                     }
9856                     REGTAIL(pRExC_state, ret, eval);
9857                     /* deal with the length of this later - MJD */
9858                     return ret;
9859                 }
9860                 ret = reganode(pRExC_state, EVAL, n);
9861                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
9862                 Set_Node_Offset(ret, parse_start);
9863                 return ret;
9864             }
9865             case '(':           /* (?(?{...})...) and (?(?=...)...) */
9866             {
9867                 int is_define= 0;
9868                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
9869                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9870                         || RExC_parse[1] == '<'
9871                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
9872                         I32 flag;
9873                         regnode *tail;
9874
9875                         ret = reg_node(pRExC_state, LOGICAL);
9876                         if (!SIZE_ONLY)
9877                             ret->flags = 1;
9878
9879                         tail = reg(pRExC_state, 1, &flag, depth+1);
9880                         if (flag & RESTART_UTF8) {
9881                             *flagp = RESTART_UTF8;
9882                             return NULL;
9883                         }
9884                         REGTAIL(pRExC_state, ret, tail);
9885                         goto insert_if;
9886                     }
9887                 }
9888                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
9889                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9890                 {
9891                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
9892                     char *name_start= RExC_parse++;
9893                     U32 num = 0;
9894                     SV *sv_dat=reg_scan_name(pRExC_state,
9895                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9896                     if (RExC_parse == name_start || *RExC_parse != ch)
9897                         vFAIL2("Sequence (?(%c... not terminated",
9898                             (ch == '>' ? '<' : ch));
9899                     RExC_parse++;
9900                     if (!SIZE_ONLY) {
9901                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
9902                         RExC_rxi->data->data[num]=(void*)sv_dat;
9903                         SvREFCNT_inc_simple_void(sv_dat);
9904                     }
9905                     ret = reganode(pRExC_state,NGROUPP,num);
9906                     goto insert_if_check_paren;
9907                 }
9908                 else if (RExC_parse[0] == 'D' &&
9909                          RExC_parse[1] == 'E' &&
9910                          RExC_parse[2] == 'F' &&
9911                          RExC_parse[3] == 'I' &&
9912                          RExC_parse[4] == 'N' &&
9913                          RExC_parse[5] == 'E')
9914                 {
9915                     ret = reganode(pRExC_state,DEFINEP,0);
9916                     RExC_parse +=6 ;
9917                     is_define = 1;
9918                     goto insert_if_check_paren;
9919                 }
9920                 else if (RExC_parse[0] == 'R') {
9921                     RExC_parse++;
9922                     parno = 0;
9923                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9924                         parno = atoi(RExC_parse++);
9925                         while (isDIGIT(*RExC_parse))
9926                             RExC_parse++;
9927                     } else if (RExC_parse[0] == '&') {
9928                         SV *sv_dat;
9929                         RExC_parse++;
9930                         sv_dat = reg_scan_name(pRExC_state,
9931                             SIZE_ONLY
9932                             ? REG_RSN_RETURN_NULL
9933                             : REG_RSN_RETURN_DATA);
9934                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9935                     }
9936                     ret = reganode(pRExC_state,INSUBP,parno);
9937                     goto insert_if_check_paren;
9938                 }
9939                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9940                     /* (?(1)...) */
9941                     char c;
9942                     char *tmp;
9943                     parno = atoi(RExC_parse++);
9944
9945                     while (isDIGIT(*RExC_parse))
9946                         RExC_parse++;
9947                     ret = reganode(pRExC_state, GROUPP, parno);
9948
9949                  insert_if_check_paren:
9950                     if (*(tmp = nextchar(pRExC_state)) != ')') {
9951                         /* nextchar also skips comments, so undo its work
9952                          * and skip over the next character.
9953                          */
9954                         RExC_parse = tmp;
9955                         RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9956                         vFAIL("Switch condition not recognized");
9957                     }
9958                   insert_if:
9959                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
9960                     br = regbranch(pRExC_state, &flags, 1,depth+1);
9961                     if (br == NULL) {
9962                         if (flags & RESTART_UTF8) {
9963                             *flagp = RESTART_UTF8;
9964                             return NULL;
9965                         }
9966                         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9967                               (UV) flags);
9968                     } else
9969                         REGTAIL(pRExC_state, br, reganode(pRExC_state,
9970                                                           LONGJMP, 0));
9971                     c = *nextchar(pRExC_state);
9972                     if (flags&HASWIDTH)
9973                         *flagp |= HASWIDTH;
9974                     if (c == '|') {
9975                         if (is_define)
9976                             vFAIL("(?(DEFINE)....) does not allow branches");
9977
9978                         /* Fake one for optimizer.  */
9979                         lastbr = reganode(pRExC_state, IFTHEN, 0);
9980
9981                         if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
9982                             if (flags & RESTART_UTF8) {
9983                                 *flagp = RESTART_UTF8;
9984                                 return NULL;
9985                             }
9986                             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9987                                   (UV) flags);
9988                         }
9989                         REGTAIL(pRExC_state, ret, lastbr);
9990                         if (flags&HASWIDTH)
9991                             *flagp |= HASWIDTH;
9992                         c = *nextchar(pRExC_state);
9993                     }
9994                     else
9995                         lastbr = NULL;
9996                     if (c != ')')
9997                         vFAIL("Switch (?(condition)... contains too many branches");
9998                     ender = reg_node(pRExC_state, TAIL);
9999                     REGTAIL(pRExC_state, br, ender);
10000                     if (lastbr) {
10001                         REGTAIL(pRExC_state, lastbr, ender);
10002                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10003                     }
10004                     else
10005                         REGTAIL(pRExC_state, ret, ender);
10006                     RExC_size++; /* XXX WHY do we need this?!!
10007                                     For large programs it seems to be required
10008                                     but I can't figure out why. -- dmq*/
10009                     return ret;
10010                 }
10011                 else {
10012                     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10013                     vFAIL("Unknown switch condition (?(...))");
10014                 }
10015             }
10016             case '[':           /* (?[ ... ]) */
10017                 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
10018                                          oregcomp_parse);
10019             case 0:
10020                 RExC_parse--; /* for vFAIL to print correctly */
10021                 vFAIL("Sequence (? incomplete");
10022                 break;
10023             default: /* e.g., (?i) */
10024                 --RExC_parse;
10025               parse_flags:
10026                 parse_lparen_question_flags(pRExC_state);
10027                 if (UCHARAT(RExC_parse) != ':') {
10028                     nextchar(pRExC_state);
10029                     *flagp = TRYAGAIN;
10030                     return NULL;
10031                 }
10032                 paren = ':';
10033                 nextchar(pRExC_state);
10034                 ret = NULL;
10035                 goto parse_rest;
10036             } /* end switch */
10037         }
10038         else {                  /* (...) */
10039           capturing_parens:
10040             parno = RExC_npar;
10041             RExC_npar++;
10042
10043             ret = reganode(pRExC_state, OPEN, parno);
10044             if (!SIZE_ONLY ){
10045                 if (!RExC_nestroot)
10046                     RExC_nestroot = parno;
10047                 if (RExC_seen & REG_RECURSE_SEEN
10048                     && !RExC_open_parens[parno-1])
10049                 {
10050                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10051                         "Setting open paren #%"IVdf" to %d\n",
10052                         (IV)parno, REG_NODE_NUM(ret)));
10053                     RExC_open_parens[parno-1]= ret;
10054                 }
10055             }
10056             Set_Node_Length(ret, 1); /* MJD */
10057             Set_Node_Offset(ret, RExC_parse); /* MJD */
10058             is_open = 1;
10059         }
10060     }
10061     else                        /* ! paren */
10062         ret = NULL;
10063
10064    parse_rest:
10065     /* Pick up the branches, linking them together. */
10066     parse_start = RExC_parse;   /* MJD */
10067     br = regbranch(pRExC_state, &flags, 1,depth+1);
10068
10069     /*     branch_len = (paren != 0); */
10070
10071     if (br == NULL) {
10072         if (flags & RESTART_UTF8) {
10073             *flagp = RESTART_UTF8;
10074             return NULL;
10075         }
10076         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10077     }
10078     if (*RExC_parse == '|') {
10079         if (!SIZE_ONLY && RExC_extralen) {
10080             reginsert(pRExC_state, BRANCHJ, br, depth+1);
10081         }
10082         else {                  /* MJD */
10083             reginsert(pRExC_state, BRANCH, br, depth+1);
10084             Set_Node_Length(br, paren != 0);
10085             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
10086         }
10087         have_branch = 1;
10088         if (SIZE_ONLY)
10089             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
10090     }
10091     else if (paren == ':') {
10092         *flagp |= flags&SIMPLE;
10093     }
10094     if (is_open) {                              /* Starts with OPEN. */
10095         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
10096     }
10097     else if (paren != '?')              /* Not Conditional */
10098         ret = br;
10099     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10100     lastbr = br;
10101     while (*RExC_parse == '|') {
10102         if (!SIZE_ONLY && RExC_extralen) {
10103             ender = reganode(pRExC_state, LONGJMP,0);
10104
10105             /* Append to the previous. */
10106             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10107         }
10108         if (SIZE_ONLY)
10109             RExC_extralen += 2;         /* Account for LONGJMP. */
10110         nextchar(pRExC_state);
10111         if (freeze_paren) {
10112             if (RExC_npar > after_freeze)
10113                 after_freeze = RExC_npar;
10114             RExC_npar = freeze_paren;
10115         }
10116         br = regbranch(pRExC_state, &flags, 0, depth+1);
10117
10118         if (br == NULL) {
10119             if (flags & RESTART_UTF8) {
10120                 *flagp = RESTART_UTF8;
10121                 return NULL;
10122             }
10123             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10124         }
10125         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
10126         lastbr = br;
10127         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10128     }
10129
10130     if (have_branch || paren != ':') {
10131         /* Make a closing node, and hook it on the end. */
10132         switch (paren) {
10133         case ':':
10134             ender = reg_node(pRExC_state, TAIL);
10135             break;
10136         case 1: case 2:
10137             ender = reganode(pRExC_state, CLOSE, parno);
10138             if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
10139                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10140                         "Setting close paren #%"IVdf" to %d\n",
10141                         (IV)parno, REG_NODE_NUM(ender)));
10142                 RExC_close_parens[parno-1]= ender;
10143                 if (RExC_nestroot == parno)
10144                     RExC_nestroot = 0;
10145             }
10146             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
10147             Set_Node_Length(ender,1); /* MJD */
10148             break;
10149         case '<':
10150         case ',':
10151         case '=':
10152         case '!':
10153             *flagp &= ~HASWIDTH;
10154             /* FALL THROUGH */
10155         case '>':
10156             ender = reg_node(pRExC_state, SUCCEED);
10157             break;
10158         case 0:
10159             ender = reg_node(pRExC_state, END);
10160             if (!SIZE_ONLY) {
10161                 assert(!RExC_opend); /* there can only be one! */
10162                 RExC_opend = ender;
10163             }
10164             break;
10165         }
10166         DEBUG_PARSE_r(if (!SIZE_ONLY) {
10167             SV * const mysv_val1=sv_newmortal();
10168             SV * const mysv_val2=sv_newmortal();
10169             DEBUG_PARSE_MSG("lsbr");
10170             regprop(RExC_rx, mysv_val1, lastbr, NULL);
10171             regprop(RExC_rx, mysv_val2, ender, NULL);
10172             PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10173                           SvPV_nolen_const(mysv_val1),
10174                           (IV)REG_NODE_NUM(lastbr),
10175                           SvPV_nolen_const(mysv_val2),
10176                           (IV)REG_NODE_NUM(ender),
10177                           (IV)(ender - lastbr)
10178             );
10179         });
10180         REGTAIL(pRExC_state, lastbr, ender);
10181
10182         if (have_branch && !SIZE_ONLY) {
10183             char is_nothing= 1;
10184             if (depth==1)
10185                 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
10186
10187             /* Hook the tails of the branches to the closing node. */
10188             for (br = ret; br; br = regnext(br)) {
10189                 const U8 op = PL_regkind[OP(br)];
10190                 if (op == BRANCH) {
10191                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
10192                     if ( OP(NEXTOPER(br)) != NOTHING
10193                          || regnext(NEXTOPER(br)) != ender)
10194                         is_nothing= 0;
10195                 }
10196                 else if (op == BRANCHJ) {
10197                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
10198                     /* for now we always disable this optimisation * /
10199                     if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
10200                          || regnext(NEXTOPER(NEXTOPER(br))) != ender)
10201                     */
10202                         is_nothing= 0;
10203                 }
10204             }
10205             if (is_nothing) {
10206                 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
10207                 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10208                     SV * const mysv_val1=sv_newmortal();
10209                     SV * const mysv_val2=sv_newmortal();
10210                     DEBUG_PARSE_MSG("NADA");
10211                     regprop(RExC_rx, mysv_val1, ret, NULL);
10212                     regprop(RExC_rx, mysv_val2, ender, NULL);
10213                     PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10214                                   SvPV_nolen_const(mysv_val1),
10215                                   (IV)REG_NODE_NUM(ret),
10216                                   SvPV_nolen_const(mysv_val2),
10217                                   (IV)REG_NODE_NUM(ender),
10218                                   (IV)(ender - ret)
10219                     );
10220                 });
10221                 OP(br)= NOTHING;
10222                 if (OP(ender) == TAIL) {
10223                     NEXT_OFF(br)= 0;
10224                     RExC_emit= br + 1;
10225                 } else {
10226                     regnode *opt;
10227                     for ( opt= br + 1; opt < ender ; opt++ )
10228                         OP(opt)= OPTIMIZED;
10229                     NEXT_OFF(br)= ender - br;
10230                 }
10231             }
10232         }
10233     }
10234
10235     {
10236         const char *p;
10237         static const char parens[] = "=!<,>";
10238
10239         if (paren && (p = strchr(parens, paren))) {
10240             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
10241             int flag = (p - parens) > 1;
10242
10243             if (paren == '>')
10244                 node = SUSPEND, flag = 0;
10245             reginsert(pRExC_state, node,ret, depth+1);
10246             Set_Node_Cur_Length(ret, parse_start);
10247             Set_Node_Offset(ret, parse_start + 1);
10248             ret->flags = flag;
10249             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10250         }
10251     }
10252
10253     /* Check for proper termination. */
10254     if (paren) {
10255         /* restore original flags, but keep (?p) */
10256         RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10257         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
10258             RExC_parse = oregcomp_parse;
10259             vFAIL("Unmatched (");
10260         }
10261     }
10262     else if (!paren && RExC_parse < RExC_end) {
10263         if (*RExC_parse == ')') {
10264             RExC_parse++;
10265             vFAIL("Unmatched )");
10266         }
10267         else
10268             FAIL("Junk on end of regexp");      /* "Can't happen". */
10269         assert(0); /* NOTREACHED */
10270     }
10271
10272     if (RExC_in_lookbehind) {
10273         RExC_in_lookbehind--;
10274     }
10275     if (after_freeze > RExC_npar)
10276         RExC_npar = after_freeze;
10277     return(ret);
10278 }
10279
10280 /*
10281  - regbranch - one alternative of an | operator
10282  *
10283  * Implements the concatenation operator.
10284  *
10285  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10286  * restarted.
10287  */
10288 STATIC regnode *
10289 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10290 {
10291     dVAR;
10292     regnode *ret;
10293     regnode *chain = NULL;
10294     regnode *latest;
10295     I32 flags = 0, c = 0;
10296     GET_RE_DEBUG_FLAGS_DECL;
10297
10298     PERL_ARGS_ASSERT_REGBRANCH;
10299
10300     DEBUG_PARSE("brnc");
10301
10302     if (first)
10303         ret = NULL;
10304     else {
10305         if (!SIZE_ONLY && RExC_extralen)
10306             ret = reganode(pRExC_state, BRANCHJ,0);
10307         else {
10308             ret = reg_node(pRExC_state, BRANCH);
10309             Set_Node_Length(ret, 1);
10310         }
10311     }
10312
10313     if (!first && SIZE_ONLY)
10314         RExC_extralen += 1;                     /* BRANCHJ */
10315
10316     *flagp = WORST;                     /* Tentatively. */
10317
10318     RExC_parse--;
10319     nextchar(pRExC_state);
10320     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10321         flags &= ~TRYAGAIN;
10322         latest = regpiece(pRExC_state, &flags,depth+1);
10323         if (latest == NULL) {
10324             if (flags & TRYAGAIN)
10325                 continue;
10326             if (flags & RESTART_UTF8) {
10327                 *flagp = RESTART_UTF8;
10328                 return NULL;
10329             }
10330             FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10331         }
10332         else if (ret == NULL)
10333             ret = latest;
10334         *flagp |= flags&(HASWIDTH|POSTPONED);
10335         if (chain == NULL)      /* First piece. */
10336             *flagp |= flags&SPSTART;
10337         else {
10338             RExC_naughty++;
10339             REGTAIL(pRExC_state, chain, latest);
10340         }
10341         chain = latest;
10342         c++;
10343     }
10344     if (chain == NULL) {        /* Loop ran zero times. */
10345         chain = reg_node(pRExC_state, NOTHING);
10346         if (ret == NULL)
10347             ret = chain;
10348     }
10349     if (c == 1) {
10350         *flagp |= flags&SIMPLE;
10351     }
10352
10353     return ret;
10354 }
10355
10356 /*
10357  - regpiece - something followed by possible [*+?]
10358  *
10359  * Note that the branching code sequences used for ? and the general cases
10360  * of * and + are somewhat optimized:  they use the same NOTHING node as
10361  * both the endmarker for their branch list and the body of the last branch.
10362  * It might seem that this node could be dispensed with entirely, but the
10363  * endmarker role is not redundant.
10364  *
10365  * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10366  * TRYAGAIN.
10367  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10368  * restarted.
10369  */
10370 STATIC regnode *
10371 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10372 {
10373     dVAR;
10374     regnode *ret;
10375     char op;
10376     char *next;
10377     I32 flags;
10378     const char * const origparse = RExC_parse;
10379     I32 min;
10380     I32 max = REG_INFTY;
10381 #ifdef RE_TRACK_PATTERN_OFFSETS
10382     char *parse_start;
10383 #endif
10384     const char *maxpos = NULL;
10385
10386     /* Save the original in case we change the emitted regop to a FAIL. */
10387     regnode * const orig_emit = RExC_emit;
10388
10389     GET_RE_DEBUG_FLAGS_DECL;
10390
10391     PERL_ARGS_ASSERT_REGPIECE;
10392
10393     DEBUG_PARSE("piec");
10394
10395     ret = regatom(pRExC_state, &flags,depth+1);
10396     if (ret == NULL) {
10397         if (flags & (TRYAGAIN|RESTART_UTF8))
10398             *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10399         else
10400             FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10401         return(NULL);
10402     }
10403
10404     op = *RExC_parse;
10405
10406     if (op == '{' && regcurly(RExC_parse, FALSE)) {
10407         maxpos = NULL;
10408 #ifdef RE_TRACK_PATTERN_OFFSETS
10409         parse_start = RExC_parse; /* MJD */
10410 #endif
10411         next = RExC_parse + 1;
10412         while (isDIGIT(*next) || *next == ',') {
10413             if (*next == ',') {
10414                 if (maxpos)
10415                     break;
10416                 else
10417                     maxpos = next;
10418             }
10419             next++;
10420         }
10421         if (*next == '}') {             /* got one */
10422             if (!maxpos)
10423                 maxpos = next;
10424             RExC_parse++;
10425             min = atoi(RExC_parse);
10426             if (*maxpos == ',')
10427                 maxpos++;
10428             else
10429                 maxpos = RExC_parse;
10430             max = atoi(maxpos);
10431             if (!max && *maxpos != '0')
10432                 max = REG_INFTY;                /* meaning "infinity" */
10433             else if (max >= REG_INFTY)
10434                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10435             RExC_parse = next;
10436             nextchar(pRExC_state);
10437             if (max < min) {    /* If can't match, warn and optimize to fail
10438                                    unconditionally */
10439                 if (SIZE_ONLY) {
10440                     ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10441
10442                     /* We can't back off the size because we have to reserve
10443                      * enough space for all the things we are about to throw
10444                      * away, but we can shrink it by the ammount we are about
10445                      * to re-use here */
10446                     RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10447                 }
10448                 else {
10449                     RExC_emit = orig_emit;
10450                 }
10451                 ret = reg_node(pRExC_state, OPFAIL);
10452                 return ret;
10453             }
10454             else if (min == max
10455                      && RExC_parse < RExC_end
10456                      && (*RExC_parse == '?' || *RExC_parse == '+'))
10457             {
10458                 if (SIZE_ONLY) {
10459                     ckWARN2reg(RExC_parse + 1,
10460                                "Useless use of greediness modifier '%c'",
10461                                *RExC_parse);
10462                 }
10463                 /* Absorb the modifier, so later code doesn't see nor use
10464                     * it */
10465                 nextchar(pRExC_state);
10466             }
10467
10468         do_curly:
10469             if ((flags&SIMPLE)) {
10470                 RExC_naughty += 2 + RExC_naughty / 2;
10471                 reginsert(pRExC_state, CURLY, ret, depth+1);
10472                 Set_Node_Offset(ret, parse_start+1); /* MJD */
10473                 Set_Node_Cur_Length(ret, parse_start);
10474             }
10475             else {
10476                 regnode * const w = reg_node(pRExC_state, WHILEM);
10477
10478                 w->flags = 0;
10479                 REGTAIL(pRExC_state, ret, w);
10480                 if (!SIZE_ONLY && RExC_extralen) {
10481                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
10482                     reginsert(pRExC_state, NOTHING,ret, depth+1);
10483                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
10484                 }
10485                 reginsert(pRExC_state, CURLYX,ret, depth+1);
10486                                 /* MJD hk */
10487                 Set_Node_Offset(ret, parse_start+1);
10488                 Set_Node_Length(ret,
10489                                 op == '{' ? (RExC_parse - parse_start) : 1);
10490
10491                 if (!SIZE_ONLY && RExC_extralen)
10492                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
10493                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10494                 if (SIZE_ONLY)
10495                     RExC_whilem_seen++, RExC_extralen += 3;
10496                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
10497             }
10498             ret->flags = 0;
10499
10500             if (min > 0)
10501                 *flagp = WORST;
10502             if (max > 0)
10503                 *flagp |= HASWIDTH;
10504             if (!SIZE_ONLY) {
10505                 ARG1_SET(ret, (U16)min);
10506                 ARG2_SET(ret, (U16)max);
10507             }
10508             if (max == REG_INFTY)
10509                 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10510
10511             goto nest_check;
10512         }
10513     }
10514
10515     if (!ISMULT1(op)) {
10516         *flagp = flags;
10517         return(ret);
10518     }
10519
10520 #if 0                           /* Now runtime fix should be reliable. */
10521
10522     /* if this is reinstated, don't forget to put this back into perldiag:
10523
10524             =item Regexp *+ operand could be empty at {#} in regex m/%s/
10525
10526            (F) The part of the regexp subject to either the * or + quantifier
10527            could match an empty string. The {#} shows in the regular
10528            expression about where the problem was discovered.
10529
10530     */
10531
10532     if (!(flags&HASWIDTH) && op != '?')
10533       vFAIL("Regexp *+ operand could be empty");
10534 #endif
10535
10536 #ifdef RE_TRACK_PATTERN_OFFSETS
10537     parse_start = RExC_parse;
10538 #endif
10539     nextchar(pRExC_state);
10540
10541     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10542
10543     if (op == '*' && (flags&SIMPLE)) {
10544         reginsert(pRExC_state, STAR, ret, depth+1);
10545         ret->flags = 0;
10546         RExC_naughty += 4;
10547         RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10548     }
10549     else if (op == '*') {
10550         min = 0;
10551         goto do_curly;
10552     }
10553     else if (op == '+' && (flags&SIMPLE)) {
10554         reginsert(pRExC_state, PLUS, ret, depth+1);
10555         ret->flags = 0;
10556         RExC_naughty += 3;
10557         RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10558     }
10559     else if (op == '+') {
10560         min = 1;
10561         goto do_curly;
10562     }
10563     else if (op == '?') {
10564         min = 0; max = 1;
10565         goto do_curly;
10566     }
10567   nest_check:
10568     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10569         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10570         ckWARN2reg(RExC_parse,
10571                    "%"UTF8f" matches null string many times",
10572                    UTF8fARG(UTF, (RExC_parse >= origparse
10573                                  ? RExC_parse - origparse
10574                                  : 0),
10575                    origparse));
10576         (void)ReREFCNT_inc(RExC_rx_sv);
10577     }
10578
10579     if (RExC_parse < RExC_end && *RExC_parse == '?') {
10580         nextchar(pRExC_state);
10581         reginsert(pRExC_state, MINMOD, ret, depth+1);
10582         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
10583     }
10584     else
10585     if (RExC_parse < RExC_end && *RExC_parse == '+') {
10586         regnode *ender;
10587         nextchar(pRExC_state);
10588         ender = reg_node(pRExC_state, SUCCEED);
10589         REGTAIL(pRExC_state, ret, ender);
10590         reginsert(pRExC_state, SUSPEND, ret, depth+1);
10591         ret->flags = 0;
10592         ender = reg_node(pRExC_state, TAIL);
10593         REGTAIL(pRExC_state, ret, ender);
10594     }
10595
10596     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
10597         RExC_parse++;
10598         vFAIL("Nested quantifiers");
10599     }
10600
10601     return(ret);
10602 }
10603
10604 STATIC bool
10605 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
10606                       UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
10607                       const bool strict   /* Apply stricter parsing rules? */
10608     )
10609 {
10610
10611  /* This is expected to be called by a parser routine that has recognized '\N'
10612    and needs to handle the rest. RExC_parse is expected to point at the first
10613    char following the N at the time of the call.  On successful return,
10614    RExC_parse has been updated to point to just after the sequence identified
10615    by this routine, and <*flagp> has been updated.
10616
10617    The \N may be inside (indicated by the boolean <in_char_class>) or outside a
10618    character class.
10619
10620    \N may begin either a named sequence, or if outside a character class, mean
10621    to match a non-newline.  For non single-quoted regexes, the tokenizer has
10622    attempted to decide which, and in the case of a named sequence, converted it
10623    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
10624    where c1... are the characters in the sequence.  For single-quoted regexes,
10625    the tokenizer passes the \N sequence through unchanged; this code will not
10626    attempt to determine this nor expand those, instead raising a syntax error.
10627    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
10628    or there is no '}', it signals that this \N occurrence means to match a
10629    non-newline.
10630
10631    Only the \N{U+...} form should occur in a character class, for the same
10632    reason that '.' inside a character class means to just match a period: it
10633    just doesn't make sense.
10634
10635    The function raises an error (via vFAIL), and doesn't return for various
10636    syntax errors.  Otherwise it returns TRUE and sets <node_p> or <valuep> on
10637    success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
10638    RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
10639    only possible if node_p is non-NULL.
10640    <strict> turns the two character class warnings issued below into errors.
10641
10642    If <valuep> is non-null, it means the caller can accept an input sequence
10643    consisting of just a single code point; <*valuep> is set to that value
10644    if the input is such.
10645
10646    If <node_p> is non-null it signifies that the caller can accept any other
10647    legal sequence (i.e., one that isn't just a single code point).  <*node_p>
10648    is set as follows:
10649     1) \N means not-a-NL: points to a newly created REG_ANY node;
10650     2) \N{}:              points to a new NOTHING node;
10651     3) otherwise:         points to a new EXACT node containing the resolved
10652                           string.
10653    Note that FALSE is returned for single code point sequences if <valuep> is
10654    null.
10655  */
10656
10657     char * endbrace;    /* '}' following the name */
10658     char* p;
10659     char *endchar;      /* Points to '.' or '}' ending cur char in the input
10660                            stream */
10661     bool has_multiple_chars; /* true if the input stream contains a sequence of
10662                                 more than one character */
10663
10664     GET_RE_DEBUG_FLAGS_DECL;
10665
10666     PERL_ARGS_ASSERT_GROK_BSLASH_N;
10667
10668     GET_RE_DEBUG_FLAGS;
10669
10670     assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
10671
10672     /* The [^\n] meaning of \N ignores spaces and comments under the /x
10673      * modifier.  The other meaning does not, so use a temporary until we find
10674      * out which we are being called with */
10675     p = (RExC_flags & RXf_PMf_EXTENDED)
10676         ? regwhite( pRExC_state, RExC_parse )
10677         : RExC_parse;
10678
10679     /* Disambiguate between \N meaning a named character versus \N meaning
10680      * [^\n].  The former is assumed when it can't be the latter. */
10681     if (*p != '{' || regcurly(p, FALSE)) {
10682         RExC_parse = p;
10683         if (! node_p) {
10684             /* no bare \N allowed in a charclass */
10685             if (in_char_class) {
10686                 vFAIL("\\N in a character class must be a named character: \\N{...}");
10687             }
10688             return FALSE;
10689         }
10690         RExC_parse--;   /* Need to back off so nextchar() doesn't skip the
10691                            current char */
10692         nextchar(pRExC_state);
10693         *node_p = reg_node(pRExC_state, REG_ANY);
10694         *flagp |= HASWIDTH|SIMPLE;
10695         RExC_naughty++;
10696         Set_Node_Length(*node_p, 1); /* MJD */
10697         return TRUE;
10698     }
10699
10700     /* Here, we have decided it should be a named character or sequence */
10701
10702     /* The test above made sure that the next real character is a '{', but
10703      * under the /x modifier, it could be separated by space (or a comment and
10704      * \n) and this is not allowed (for consistency with \x{...} and the
10705      * tokenizer handling of \N{NAME}). */
10706     if (*RExC_parse != '{') {
10707         vFAIL("Missing braces on \\N{}");
10708     }
10709
10710     RExC_parse++;       /* Skip past the '{' */
10711
10712     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
10713         || ! (endbrace == RExC_parse            /* nothing between the {} */
10714               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below
10715                                                  */
10716                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg)
10717                                                      */
10718     {
10719         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
10720         vFAIL("\\N{NAME} must be resolved by the lexer");
10721     }
10722
10723     if (endbrace == RExC_parse) {   /* empty: \N{} */
10724         bool ret = TRUE;
10725         if (node_p) {
10726             *node_p = reg_node(pRExC_state,NOTHING);
10727         }
10728         else if (in_char_class) {
10729             if (SIZE_ONLY && in_char_class) {
10730                 if (strict) {
10731                     RExC_parse++;   /* Position after the "}" */
10732                     vFAIL("Zero length \\N{}");
10733                 }
10734                 else {
10735                     ckWARNreg(RExC_parse,
10736                               "Ignoring zero length \\N{} in character class");
10737                 }
10738             }
10739             ret = FALSE;
10740         }
10741         else {
10742             return FALSE;
10743         }
10744         nextchar(pRExC_state);
10745         return ret;
10746     }
10747
10748     RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
10749     RExC_parse += 2;    /* Skip past the 'U+' */
10750
10751     endchar = RExC_parse + strcspn(RExC_parse, ".}");
10752
10753     /* Code points are separated by dots.  If none, there is only one code
10754      * point, and is terminated by the brace */
10755     has_multiple_chars = (endchar < endbrace);
10756
10757     if (valuep && (! has_multiple_chars || in_char_class)) {
10758         /* We only pay attention to the first char of
10759         multichar strings being returned in char classes. I kinda wonder
10760         if this makes sense as it does change the behaviour
10761         from earlier versions, OTOH that behaviour was broken
10762         as well. XXX Solution is to recharacterize as
10763         [rest-of-class]|multi1|multi2... */
10764
10765         STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
10766         I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
10767             | PERL_SCAN_DISALLOW_PREFIX
10768             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
10769
10770         *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
10771
10772         /* The tokenizer should have guaranteed validity, but it's possible to
10773          * bypass it by using single quoting, so check */
10774         if (length_of_hex == 0
10775             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
10776         {
10777             RExC_parse += length_of_hex;        /* Includes all the valid */
10778             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
10779                             ? UTF8SKIP(RExC_parse)
10780                             : 1;
10781             /* Guard against malformed utf8 */
10782             if (RExC_parse >= endchar) {
10783                 RExC_parse = endchar;
10784             }
10785             vFAIL("Invalid hexadecimal number in \\N{U+...}");
10786         }
10787
10788         if (in_char_class && has_multiple_chars) {
10789             if (strict) {
10790                 RExC_parse = endbrace;
10791                 vFAIL("\\N{} in character class restricted to one character");
10792             }
10793             else {
10794                 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
10795             }
10796         }
10797
10798         RExC_parse = endbrace + 1;
10799     }
10800     else if (! node_p || ! has_multiple_chars) {
10801
10802         /* Here, the input is legal, but not according to the caller's
10803          * options.  We fail without advancing the parse, so that the
10804          * caller can try again */
10805         RExC_parse = p;
10806         return FALSE;
10807     }
10808     else {
10809
10810         /* What is done here is to convert this to a sub-pattern of the form
10811          * (?:\x{char1}\x{char2}...)
10812          * and then call reg recursively.  That way, it retains its atomicness,
10813          * while not having to worry about special handling that some code
10814          * points may have.  toke.c has converted the original Unicode values
10815          * to native, so that we can just pass on the hex values unchanged.  We
10816          * do have to set a flag to keep recoding from happening in the
10817          * recursion */
10818
10819         SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
10820         STRLEN len;
10821         char *orig_end = RExC_end;
10822         I32 flags;
10823
10824         while (RExC_parse < endbrace) {
10825
10826             /* Convert to notation the rest of the code understands */
10827             sv_catpv(substitute_parse, "\\x{");
10828             sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
10829             sv_catpv(substitute_parse, "}");
10830
10831             /* Point to the beginning of the next character in the sequence. */
10832             RExC_parse = endchar + 1;
10833             endchar = RExC_parse + strcspn(RExC_parse, ".}");
10834         }
10835         sv_catpv(substitute_parse, ")");
10836
10837         RExC_parse = SvPV(substitute_parse, len);
10838         /* Don't allow empty number; fail in the real pattern, not the temporary */
10839         if (len < 8) {
10840             RExC_parse = endbrace;   /* position msg's '<--HERE' */
10841             vFAIL("Invalid hexadecimal number in \\N{U+...}");
10842         }
10843         RExC_end = RExC_parse + len;
10844
10845         /* The values are Unicode, and therefore not subject to recoding */
10846         RExC_override_recoding = 1;
10847
10848         if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
10849             if (flags & RESTART_UTF8) {
10850                 *flagp = RESTART_UTF8;
10851                 return FALSE;
10852             }
10853             FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
10854                   (UV) flags);
10855         }
10856         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10857
10858         RExC_parse = endbrace;
10859         RExC_end = orig_end;
10860         RExC_override_recoding = 0;
10861
10862         nextchar(pRExC_state);
10863     }
10864
10865     return TRUE;
10866 }
10867
10868
10869 /*
10870  * reg_recode
10871  *
10872  * Recodes the single byte <value> and returns the resulting code point.
10873  *    value: a code value in the source encoding
10874  *    encp:  a pointer to an Encode object; if *encp is NULL, <value> is
10875  *           not recoded
10876  * If the result from Encode is empty or is not a single character,
10877  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
10878  */
10879 STATIC UV
10880 S_reg_recode(pTHX_ const char value, SV **encp)
10881 {
10882     STRLEN char_len = 1;    /* bytes making up the first character */
10883     SV * const tmp = newSVpvn_flags(&value, char_len, SVs_TEMP);
10884     const char * const buf = *encp ? sv_recode_to_utf8(tmp, *encp) : SvPVX(tmp);
10885     const STRLEN buf_len = SvCUR(tmp);  /* length after any recoding */
10886     UV cp;
10887
10888     PERL_ARGS_ASSERT_REG_RECODE;
10889
10890     if (buf_len != 0) {
10891         cp = SvUTF8(tmp)
10892              ? utf8n_to_uvchr((U8*)buf, buf_len, &char_len, UTF8_ALLOW_DEFAULT)
10893              : *(U8*)buf;
10894         if (char_len == buf_len) {
10895             return cp;      /* exactly one character came back */
10896         }
10897     }
10898     *encp = NULL;           /* empty or multi-character result */
10899     return UNICODE_REPLACEMENT;
10900 }
10901
10902 PERL_STATIC_INLINE U8
10903 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
10904 {
10905     /* Returns the EXACTish opcode for the current flags: EXACT when not
10906      * folding, otherwise EXACTF plus an offset derived from the charset */
10907     U8 charset;
10908
10909     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
10910
10911     if (FOLD) {
10912         charset = get_regex_charset(RExC_flags);
10913         /* /a is same as /u, and /aa's offset maps to what /a's would have
10914          * been, so from /a on each offset is one lower, leaving no hole */
10915         return (charset >= REGEX_ASCII_RESTRICTED_CHARSET)
10916                ? EXACTF + charset - 1
10917                : EXACTF + charset;
10918     }
10919     return EXACT;
10920 }
10921
10922 PERL_STATIC_INLINE void
10923 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
10924                          regnode *node, I32* flagp, STRLEN len, UV code_point,
10925                          bool downgradable)
10926 {
10927     /* This knows the details about sizing an EXACTish node, setting flags for
10928      * it (by setting <*flagp>), and potentially populating it with a single
10929      * character.
10930      *
10931      * If <len> (the length in bytes) is non-zero, this function assumes that
10932      * the node has already been populated, and just does the sizing.  In this
10933      * case <code_point> should be the final code point that has already been
10934      * placed into the node.  This value will be ignored except that under some
10935      * circumstances <*flagp> is set based on it.
10936      *
10937      * If <len> is zero, the function assumes that the node is to contain only
10938      * the single character given by <code_point> and calculates what <len>
10939      * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
10940      * additionally will populate the node's STRING with <code_point> or its
10941      * fold if folding.
10942      *
10943      * In both cases <*flagp> is appropriately set
10944      *
10945      * It knows that under FOLD, the Latin Sharp S and UTF characters above
10946      * 255, must be folded (the former only when the rules indicate it can
10947      * match 'ss')
10948      *
10949      * When it does the populating, it looks at the flag 'downgradable'.  If
10950      * true with a node that folds, it checks if the single code point
10951      * participates in a fold, and if not downgrades the node to an EXACT.
10952      * This helps the optimizer */
10953
10954     bool len_passed_in = cBOOL(len != 0);
10955     U8 character[UTF8_MAXBYTES_CASE+1];
10956
10957     PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
10958
10959     /* Don't bother to check for downgrading in PASS1, as it doesn't make any
10960      * sizing difference, and is extra work that is thrown away */
10961     if (downgradable && ! PASS2) {
10962         downgradable = FALSE;
10963     }
10964
10965     if (! len_passed_in) {
10966         if (UTF) {
10967             if (UNI_IS_INVARIANT(code_point)) {
10968                 if (LOC || ! FOLD) {    /* /l defers folding until runtime */
10969                     *character = (U8) code_point;
10970                 }
10971                 else { /* Here is /i and not /l (toFOLD() is defined on just
10972                           ASCII, which isn't the same thing as INVARIANT on
10973                           EBCDIC, but it works there, as the extra invariants
10974                           fold to themselves) */
10975                     *character = toFOLD((U8) code_point);
10976                     if (downgradable
10977                         && *character == code_point
10978                         && ! HAS_NONLATIN1_FOLD_CLOSURE(code_point))
10979                     {
10980                         OP(node) = EXACT;
10981                     }
10982                 }
10983                 len = 1;
10984             }
10985             else if (FOLD && (! LOC
10986                               || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
10987             {   /* Folding, and ok to do so now */
10988                 UV folded = _to_uni_fold_flags(
10989                                    code_point,
10990                                    character,
10991                                    &len,
10992                                    FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
10993                                                       ? FOLD_FLAGS_NOMIX_ASCII
10994                                                       : 0));
10995                 if (downgradable
10996                     && folded == code_point
10997                     && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
10998                 {
10999                     OP(node) = EXACT;
11000                 }
11001             }
11002             else if (code_point <= MAX_UTF8_TWO_BYTE) {
11003
11004                 /* Not folding this cp, and can output it directly */
11005                 *character = UTF8_TWO_BYTE_HI(code_point);
11006                 *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
11007                 len = 2;
11008             }
11009             else {
11010                 uvchr_to_utf8( character, code_point);
11011                 len = UTF8SKIP(character);
11012             }
11013         } /* Else pattern isn't UTF8.  */
11014         else if (! FOLD) {
11015             *character = (U8) code_point;
11016             len = 1;
11017         } /* Else is folded non-UTF8 */
11018         else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
11019
11020             /* We don't fold any non-UTF8 except possibly the Sharp s  (see
11021              * comments at join_exact()); */
11022             *character = (U8) code_point;
11023             len = 1;
11024
11025             /* Can turn into an EXACT node if we know the fold at compile time,
11026              * and it folds to itself and doesn't participate in other folds */
11027             if (downgradable
11028                 && ! LOC
11029                 && PL_fold_latin1[code_point] == code_point
11030                 && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11031                     || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
11032             {
11033                 OP(node) = EXACT;
11034             }
11035         } /* else is Sharp s.  May need to fold it */
11036         else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
11037             *character = 's';
11038             *(character + 1) = 's';
11039             len = 2;
11040         }
11041         else {
11042             *character = LATIN_SMALL_LETTER_SHARP_S;
11043             len = 1;
11044         }
11045     }
11046
11047     if (SIZE_ONLY) {    /* just add the string's space to the running size */
11048         RExC_size += STR_SZ(len);
11049     }
11050     else {              /* advance the emit pointer and record the length */
11051         RExC_emit += STR_SZ(len);
11052         STR_LEN(node) = len;
11053         if (! len_passed_in) {  /* node not yet populated: store the char */
11054             Copy((char *) character, STRING(node), len, char);
11055         }
11056     }
11057
11058     *flagp |= HASWIDTH;
11059
11060     /* A single character node is SIMPLE, except for the special-cased SHARP S
11061      * under /di. */
11062     if ((len == 1 || (UTF && len == UNISKIP(code_point)))
11063         && (code_point != LATIN_SMALL_LETTER_SHARP_S
11064             || ! FOLD || ! DEPENDS_SEMANTICS))
11065     {
11066         *flagp |= SIMPLE;
11067     }
11068
11069     /* The OP may not be well defined in PASS1 */
11070     if (PASS2 && OP(node) == EXACTFL) {
11071         RExC_contains_locale = 1;
11072     }
11073 }
11074
11075
11076 /* Return atoi(p) for the run of digits starting at p.  If there are no
11077  * digits, or more than 9 (too big to sensibly be a backref, and possibly
11078  * wrapping in 32 bits), return I32_MAX instead */
11079 static I32
11080 S_backref_value(char *p)
11081 {
11082     const char *digits_end = p;     /* one past the last leading digit */
11083
11084     while (isDIGIT(*digits_end)) {
11085         digits_end++;
11086     }
11087     return (digits_end == p || digits_end - p > 9) ? I32_MAX : atoi(p);
11088 }
11089
11090
11091 /*
11092  - regatom - the lowest level
11093
11094    Try to identify anything special at the start of the pattern. If there
11095    is, then handle it as required. This may involve generating a single regop,
11096    such as for an assertion; or it may involve recursing, such as to
11097    handle a () structure.
11098
11099    If the string doesn't start with something special then we gobble up
11100    as much literal text as we can.
11101
11102    Once we have been able to handle whatever type of thing started the
11103    sequence, we return.
11104
11105    Note: we have to be careful with escapes, as they can be both literal
11106    and special, and in the case of \10 and friends, context determines which.
11107
11108    A summary of the code structure is:
11109
11110    switch (first_byte) {
11111         cases for each special:
11112             handle this special;
11113             break;
11114         case '\\':
11115             switch (2nd byte) {
11116                 cases for each unambiguous special:
11117                     handle this special;
11118                     break;
11119                 cases for each ambiguous special/literal:
11120                     disambiguate;
11121                     if (special)  handle here
11122                     else goto defchar;
11123                 default: // unambiguously literal:
11124                     goto defchar;
11125             }
11126         default:  // is a literal char
11127             // FALL THROUGH
11128         defchar:
11129             create EXACTish node for literal;
11130             while (more input and node isn't full) {
11131                 switch (input_byte) {
11132                    cases for each special;
11133                        make sure parse pointer is set so that the next call to
11134                            regatom will see this special first
11135                        goto loopdone; // EXACTish node terminated by prev. char
11136                    default:
11137                        append char to EXACTISH node;
11138                 }
11139                 get next input byte;
11140             }
11141         loopdone:
11142    }
11143    return the generated node;
11144
11145    Specifically there are two separate switches for handling
11146    escape sequences, with the one for handling literal escapes requiring
11147    a dummy entry for all of the special escapes that are actually handled
11148    by the other.
11149
11150    Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
11151    TRYAGAIN.
11152    Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
11153    restarted.
11154    Otherwise does not return NULL.
11155 */
11156
11157 STATIC regnode *
11158 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11159 {
11160     dVAR;
11161     regnode *ret = NULL;
11162     I32 flags = 0;
11163     char *parse_start = RExC_parse;
11164     U8 op;
11165     int invert = 0;
11166
11167     GET_RE_DEBUG_FLAGS_DECL;
11168
11169     *flagp = WORST;             /* Tentatively. */
11170
11171     DEBUG_PARSE("atom");
11172
11173     PERL_ARGS_ASSERT_REGATOM;
11174
11175 tryagain:
11176     switch ((U8)*RExC_parse) {
11177     case '^':
11178         RExC_seen_zerolen++;
11179         nextchar(pRExC_state);
11180         if (RExC_flags & RXf_PMf_MULTILINE)
11181             ret = reg_node(pRExC_state, MBOL);
11182         else if (RExC_flags & RXf_PMf_SINGLELINE)
11183             ret = reg_node(pRExC_state, SBOL);
11184         else
11185             ret = reg_node(pRExC_state, BOL);
11186         Set_Node_Length(ret, 1); /* MJD */
11187         break;
11188     case '$':
11189         nextchar(pRExC_state);
11190         if (*RExC_parse)
11191             RExC_seen_zerolen++;
11192         if (RExC_flags & RXf_PMf_MULTILINE)
11193             ret = reg_node(pRExC_state, MEOL);
11194         else if (RExC_flags & RXf_PMf_SINGLELINE)
11195             ret = reg_node(pRExC_state, SEOL);
11196         else
11197             ret = reg_node(pRExC_state, EOL);
11198         Set_Node_Length(ret, 1); /* MJD */
11199         break;
11200     case '.':
11201         nextchar(pRExC_state);
11202         if (RExC_flags & RXf_PMf_SINGLELINE)
11203             ret = reg_node(pRExC_state, SANY);
11204         else
11205             ret = reg_node(pRExC_state, REG_ANY);
11206         *flagp |= HASWIDTH|SIMPLE;
11207         RExC_naughty++;
11208         Set_Node_Length(ret, 1); /* MJD */
11209         break;
11210     case '[':
11211     {
11212         char * const oregcomp_parse = ++RExC_parse;
11213         ret = regclass(pRExC_state, flagp,depth+1,
11214                        FALSE, /* means parse the whole char class */
11215                        TRUE, /* allow multi-char folds */
11216                        FALSE, /* don't silence non-portable warnings. */
11217                        NULL);
11218         if (*RExC_parse != ']') {
11219             RExC_parse = oregcomp_parse;
11220             vFAIL("Unmatched [");
11221         }
11222         if (ret == NULL) {
11223             if (*flagp & RESTART_UTF8)
11224                 return NULL;
11225             FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11226                   (UV) *flagp);
11227         }
11228         nextchar(pRExC_state);
11229         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
11230         break;
11231     }
11232     case '(':
11233         nextchar(pRExC_state);
11234         ret = reg(pRExC_state, 2, &flags,depth+1);
11235         if (ret == NULL) {
11236                 if (flags & TRYAGAIN) {
11237                     if (RExC_parse == RExC_end) {
11238                          /* Make parent create an empty node if needed. */
11239                         *flagp |= TRYAGAIN;
11240                         return(NULL);
11241                     }
11242                     goto tryagain;
11243                 }
11244                 if (flags & RESTART_UTF8) {
11245                     *flagp = RESTART_UTF8;
11246                     return NULL;
11247                 }
11248                 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
11249                                                                  (UV) flags);
11250         }
11251         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11252         break;
11253     case '|':
11254     case ')':
11255         if (flags & TRYAGAIN) {
11256             *flagp |= TRYAGAIN;
11257             return NULL;
11258         }
11259         vFAIL("Internal urp");
11260                                 /* Supposed to be caught earlier. */
11261         break;
11262     case '{':
11263         if (!regcurly(RExC_parse, FALSE)) {
11264             RExC_parse++;
11265             goto defchar;
11266         }
11267         /* FALL THROUGH */
11268     case '?':
11269     case '+':
11270     case '*':
11271         RExC_parse++;
11272         vFAIL("Quantifier follows nothing");
11273         break;
11274     case '\\':
11275         /* Special Escapes
11276
11277            This switch handles escape sequences that resolve to some kind
11278            of special regop and not to literal text. Escape sequnces that
11279            resolve to literal text are handled below in the switch marked
11280            "Literal Escapes".
11281
11282            Every entry in this switch *must* have a corresponding entry
11283            in the literal escape switch. However, the opposite is not
11284            required, as the default for this switch is to jump to the
11285            literal text handling code.
11286         */
11287         switch ((U8)*++RExC_parse) {
11288             U8 arg;
11289         /* Special Escapes */
11290         case 'A':
11291             RExC_seen_zerolen++;
11292             ret = reg_node(pRExC_state, SBOL);
11293             *flagp |= SIMPLE;
11294             goto finish_meta_pat;
11295         case 'G':
11296             ret = reg_node(pRExC_state, GPOS);
11297             RExC_seen |= REG_GPOS_SEEN;
11298             *flagp |= SIMPLE;
11299             goto finish_meta_pat;
11300         case 'K':
11301             RExC_seen_zerolen++;
11302             ret = reg_node(pRExC_state, KEEPS);
11303             *flagp |= SIMPLE;
11304             /* XXX:dmq : disabling in-place substitution seems to
11305              * be necessary here to avoid cases of memory corruption, as
11306              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
11307              */
11308             RExC_seen |= REG_LOOKBEHIND_SEEN;
11309             goto finish_meta_pat;
11310         case 'Z':
11311             ret = reg_node(pRExC_state, SEOL);
11312             *flagp |= SIMPLE;
11313             RExC_seen_zerolen++;                /* Do not optimize RE away */
11314             goto finish_meta_pat;
11315         case 'z':
11316             ret = reg_node(pRExC_state, EOS);
11317             *flagp |= SIMPLE;
11318             RExC_seen_zerolen++;                /* Do not optimize RE away */
11319             goto finish_meta_pat;
11320         case 'C':
11321             ret = reg_node(pRExC_state, CANY);
11322             RExC_seen |= REG_CANY_SEEN;
11323             *flagp |= HASWIDTH|SIMPLE;
11324             goto finish_meta_pat;
11325         case 'X':
11326             ret = reg_node(pRExC_state, CLUMP);
11327             *flagp |= HASWIDTH;
11328             goto finish_meta_pat;
11329
11330         case 'W':
11331             invert = 1;
11332             /* FALLTHROUGH */
11333         case 'w':
11334             arg = ANYOF_WORDCHAR;
11335             goto join_posix;
11336
11337         case 'b':
11338             RExC_seen_zerolen++;
11339             RExC_seen |= REG_LOOKBEHIND_SEEN;
11340             op = BOUND + get_regex_charset(RExC_flags);
11341             if (op > BOUNDA) {  /* /aa is same as /a */
11342                 op = BOUNDA;
11343             }
11344             else if (op == BOUNDL) {
11345                 RExC_contains_locale = 1;
11346             }
11347             ret = reg_node(pRExC_state, op);
11348             FLAGS(ret) = get_regex_charset(RExC_flags);
11349             *flagp |= SIMPLE;
11350             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11351                 /* diag_listed_as: Use "%s" instead of "%s" */
11352                 vFAIL("Use \"\\b\\{\" instead of \"\\b{\"");
11353             }
11354             goto finish_meta_pat;
11355         case 'B':
11356             RExC_seen_zerolen++;
11357             RExC_seen |= REG_LOOKBEHIND_SEEN;
11358             op = NBOUND + get_regex_charset(RExC_flags);
11359             if (op > NBOUNDA) { /* /aa is same as /a */
11360                 op = NBOUNDA;
11361             }
11362             else if (op == NBOUNDL) {
11363                 RExC_contains_locale = 1;
11364             }
11365             ret = reg_node(pRExC_state, op);
11366             FLAGS(ret) = get_regex_charset(RExC_flags);
11367             *flagp |= SIMPLE;
11368             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11369                 /* diag_listed_as: Use "%s" instead of "%s" */
11370                 vFAIL("Use \"\\B\\{\" instead of \"\\B{\"");
11371             }
11372             goto finish_meta_pat;
11373
11374         case 'D':
11375             invert = 1;
11376             /* FALLTHROUGH */
11377         case 'd':
11378             arg = ANYOF_DIGIT;
11379             goto join_posix;
11380
11381         case 'R':
11382             ret = reg_node(pRExC_state, LNBREAK);
11383             *flagp |= HASWIDTH|SIMPLE;
11384             goto finish_meta_pat;
11385
11386         case 'H':
11387             invert = 1;
11388             /* FALLTHROUGH */
11389         case 'h':
11390             arg = ANYOF_BLANK;
11391             op = POSIXU;
11392             goto join_posix_op_known;
11393
11394         case 'V':
11395             invert = 1;
11396             /* FALLTHROUGH */
11397         case 'v':
11398             arg = ANYOF_VERTWS;
11399             op = POSIXU;
11400             goto join_posix_op_known;
11401
11402         case 'S':
11403             invert = 1;
11404             /* FALLTHROUGH */
11405         case 's':
11406             arg = ANYOF_SPACE;
11407
11408         join_posix:
11409
11410             op = POSIXD + get_regex_charset(RExC_flags);
11411             if (op > POSIXA) {  /* /aa is same as /a */
11412                 op = POSIXA;
11413             }
11414             else if (op == POSIXL) {
11415                 RExC_contains_locale = 1;
11416             }
11417
11418         join_posix_op_known:
11419
11420             if (invert) {
11421                 op += NPOSIXD - POSIXD;
11422             }
11423
11424             ret = reg_node(pRExC_state, op);
11425             if (! SIZE_ONLY) {
11426                 FLAGS(ret) = namedclass_to_classnum(arg);
11427             }
11428
11429             *flagp |= HASWIDTH|SIMPLE;
11430             /* FALL THROUGH */
11431
11432          finish_meta_pat:
11433             nextchar(pRExC_state);
11434             Set_Node_Length(ret, 2); /* MJD */
11435             break;
11436         case 'p':
11437         case 'P':
11438             {
11439 #ifdef DEBUGGING
11440                 char* parse_start = RExC_parse - 2;
11441 #endif
11442
11443                 RExC_parse--;
11444
11445                 ret = regclass(pRExC_state, flagp,depth+1,
11446                                TRUE, /* means just parse this element */
11447                                FALSE, /* don't allow multi-char folds */
11448                                FALSE, /* don't silence non-portable warnings.
11449                                          It would be a bug if these returned
11450                                          non-portables */
11451                                NULL);
11452                 /* regclass() can only return RESTART_UTF8 if multi-char folds
11453                    are allowed.  */
11454                 if (!ret)
11455                     FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11456                           (UV) *flagp);
11457
11458                 RExC_parse--;
11459
11460                 Set_Node_Offset(ret, parse_start + 2);
11461                 Set_Node_Cur_Length(ret, parse_start);
11462                 nextchar(pRExC_state);
11463             }
11464             break;
11465         case 'N':
11466             /* Handle \N and \N{NAME} with multiple code points here and not
11467              * below because it can be multicharacter. join_exact() will join
11468              * them up later on.  Also this makes sure that things like
11469              * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
11470              * The options to the grok function call causes it to fail if the
11471              * sequence is just a single code point.  We then go treat it as
11472              * just another character in the current EXACT node, and hence it
11473              * gets uniform treatment with all the other characters.  The
11474              * special treatment for quantifiers is not needed for such single
11475              * character sequences */
11476             ++RExC_parse;
11477             if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
11478                                 FALSE /* not strict */ )) {
11479                 if (*flagp & RESTART_UTF8)
11480                     return NULL;
11481                 RExC_parse--;
11482                 goto defchar;
11483             }
11484             break;
11485         case 'k':    /* Handle \k<NAME> and \k'NAME' */
11486         parse_named_seq:
11487         {
11488             char ch= RExC_parse[1];
11489             if (ch != '<' && ch != '\'' && ch != '{') {
11490                 RExC_parse++;
11491                 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11492                 vFAIL2("Sequence %.2s... not terminated",parse_start);
11493             } else {
11494                 /* this pretty much dupes the code for (?P=...) in reg(), if
11495                    you change this make sure you change that */
11496                 char* name_start = (RExC_parse += 2);
11497                 U32 num = 0;
11498                 SV *sv_dat = reg_scan_name(pRExC_state,
11499                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11500                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
11501                 if (RExC_parse == name_start || *RExC_parse != ch)
11502                     /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11503                     vFAIL2("Sequence %.3s... not terminated",parse_start);
11504
11505                 if (!SIZE_ONLY) {
11506                     num = add_data( pRExC_state, STR_WITH_LEN("S"));
11507                     RExC_rxi->data->data[num]=(void*)sv_dat;
11508                     SvREFCNT_inc_simple_void(sv_dat);
11509                 }
11510
11511                 RExC_sawback = 1;
11512                 ret = reganode(pRExC_state,
11513                                ((! FOLD)
11514                                  ? NREF
11515                                  : (ASCII_FOLD_RESTRICTED)
11516                                    ? NREFFA
11517                                    : (AT_LEAST_UNI_SEMANTICS)
11518                                      ? NREFFU
11519                                      : (LOC)
11520                                        ? NREFFL
11521                                        : NREFF),
11522                                 num);
11523                 *flagp |= HASWIDTH;
11524
11525                 /* override incorrect value set in reganode MJD */
11526                 Set_Node_Offset(ret, parse_start+1);
11527                 Set_Node_Cur_Length(ret, parse_start);
11528                 nextchar(pRExC_state);
11529
11530             }
11531             break;
11532         }
11533         case 'g':
11534         case '1': case '2': case '3': case '4':
11535         case '5': case '6': case '7': case '8': case '9':
11536             {
11537                 I32 num;
11538                 bool hasbrace = 0;
11539
11540                 if (*RExC_parse == 'g') {
11541                     bool isrel = 0;
11542
11543                     RExC_parse++;
11544                     if (*RExC_parse == '{') {
11545                         RExC_parse++;
11546                         hasbrace = 1;
11547                     }
11548                     if (*RExC_parse == '-') {
11549                         RExC_parse++;
11550                         isrel = 1;
11551                     }
11552                     if (hasbrace && !isDIGIT(*RExC_parse)) {
11553                         if (isrel) RExC_parse--;
11554                         RExC_parse -= 2;
11555                         goto parse_named_seq;
11556                     }
11557
11558                     num = S_backref_value(RExC_parse);
11559                     if (num == 0)
11560                         vFAIL("Reference to invalid group 0");
11561                     else if (num == I32_MAX) {
11562                          if (isDIGIT(*RExC_parse))
11563                             vFAIL("Reference to nonexistent group");
11564                         else
11565                             vFAIL("Unterminated \\g... pattern");
11566                     }
11567
11568                     if (isrel) {
11569                         num = RExC_npar - num;
11570                         if (num < 1)
11571                             vFAIL("Reference to nonexistent or unclosed group");
11572                     }
11573                 }
11574                 else {
11575                     num = S_backref_value(RExC_parse);
11576                     /* bare \NNN might be backref or octal - if it is larger than or equal
11577                      * to RExC_npar then it is assumed to be an octal escape.
11578                      * Note RExC_npar is +1 from the actual number of parens*/
11579                     if (num == I32_MAX || (num > 9 && num >= RExC_npar
11580                             && *RExC_parse != '8' && *RExC_parse != '9'))
11581                     {
11582                         /* Probably a character specified in octal, e.g. \35 */
11583                         goto defchar;
11584                     }
11585                 }
11586
11587                 /* at this point RExC_parse definitely points to a backref
11588                  * number */
11589                 {
11590 #ifdef RE_TRACK_PATTERN_OFFSETS
11591                     char * const parse_start = RExC_parse - 1; /* MJD */
11592 #endif
11593                     while (isDIGIT(*RExC_parse))
11594                         RExC_parse++;
11595                     if (hasbrace) {
11596                         if (*RExC_parse != '}')
11597                             vFAIL("Unterminated \\g{...} pattern");
11598                         RExC_parse++;
11599                     }
11600                     if (!SIZE_ONLY) {
11601                         if (num > (I32)RExC_rx->nparens)
11602                             vFAIL("Reference to nonexistent group");
11603                     }
11604                     RExC_sawback = 1;
11605                     ret = reganode(pRExC_state,
11606                                    ((! FOLD)
11607                                      ? REF
11608                                      : (ASCII_FOLD_RESTRICTED)
11609                                        ? REFFA
11610                                        : (AT_LEAST_UNI_SEMANTICS)
11611                                          ? REFFU
11612                                          : (LOC)
11613                                            ? REFFL
11614                                            : REFF),
11615                                     num);
11616                     *flagp |= HASWIDTH;
11617
11618                     /* override incorrect value set in reganode MJD */
11619                     Set_Node_Offset(ret, parse_start+1);
11620                     Set_Node_Cur_Length(ret, parse_start);
11621                     RExC_parse--;
11622                     nextchar(pRExC_state);
11623                 }
11624             }
11625             break;
11626         case '\0':
11627             if (RExC_parse >= RExC_end)
11628                 FAIL("Trailing \\");
11629             /* FALL THROUGH */
11630         default:
11631             /* Do not generate "unrecognized" warnings here, we fall
11632                back into the quick-grab loop below */
11633             parse_start--;
11634             goto defchar;
11635         }
11636         break;
11637
11638     case '#':
11639         if (RExC_flags & RXf_PMf_EXTENDED) {
11640             if ( reg_skipcomment( pRExC_state ) )
11641                 goto tryagain;
11642         }
11643         /* FALL THROUGH */
11644
11645     default:
11646
11647             parse_start = RExC_parse - 1;
11648
11649             RExC_parse++;
11650
11651         defchar: {
11652             STRLEN len = 0;
11653             UV ender = 0;
11654             char *p;
11655             char *s;
11656 #define MAX_NODE_STRING_SIZE 127
11657             char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
11658             char *s0;
11659             U8 upper_parse = MAX_NODE_STRING_SIZE;
11660             U8 node_type = compute_EXACTish(pRExC_state);
11661             bool next_is_quantifier;
11662             char * oldp = NULL;
11663
11664             /* We can convert EXACTF nodes to EXACTFU if they contain only
11665              * characters that match identically regardless of the target
11666              * string's UTF8ness.  The reason to do this is that EXACTF is not
11667              * trie-able, EXACTFU is.
11668              *
11669              * Similarly, we can convert EXACTFL nodes to EXACTFU if they
11670              * contain only above-Latin1 characters (hence must be in UTF8),
11671              * which don't participate in folds with Latin1-range characters,
11672              * as the latter's folds aren't known until runtime.  (We don't
11673              * need to figure this out until pass 2) */
11674             bool maybe_exactfu = PASS2
11675                                && (node_type == EXACTF || node_type == EXACTFL);
11676
11677             /* If a folding node contains only code points that don't
11678              * participate in folds, it can be changed into an EXACT node,
11679              * which allows the optimizer more things to look for */
11680             bool maybe_exact;
11681
11682             ret = reg_node(pRExC_state, node_type);
11683
11684             /* In pass1, folded, we use a temporary buffer instead of the
11685              * actual node, as the node doesn't exist yet */
11686             s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
11687
11688             s0 = s;
11689
11690         reparse:
11691
11692             /* We do the EXACTFish to EXACT node only if folding.  (And we
11693              * don't need to figure this out until pass 2) */
11694             maybe_exact = FOLD && PASS2;
11695
11696             /* XXX The node can hold up to 255 bytes, yet this only goes to
11697              * 127.  I (khw) do not know why.  Keeping it somewhat less than
11698              * 255 allows us to not have to worry about overflow due to
11699              * converting to utf8 and fold expansion, but that value is
11700              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
11701              * split up by this limit into a single one using the real max of
11702              * 255.  Even at 127, this breaks under rare circumstances.  If
11703              * folding, we do not want to split a node at a character that is a
11704              * non-final in a multi-char fold, as an input string could just
11705              * happen to want to match across the node boundary.  The join
11706              * would solve that problem if the join actually happens.  But a
11707              * series of more than two nodes in a row each of 127 would cause
11708              * the first join to succeed to get to 254, but then there wouldn't
11709              * be room for the next one, which could be one of those split
11710              * multi-char folds.  I don't know of any fool-proof solution.  One
11711              * could back off to end with only a code point that isn't such a
11712              * non-final, but it is possible for there not to be any in the
11713              * entire node. */
11714             for (p = RExC_parse - 1;
11715                  len < upper_parse && p < RExC_end;
11716                  len++)
11717             {
11718                 oldp = p;
11719
11720                 if (RExC_flags & RXf_PMf_EXTENDED)
11721                     p = regwhite( pRExC_state, p );
11722                 switch ((U8)*p) {
11723                 case '^':
11724                 case '$':
11725                 case '.':
11726                 case '[':
11727                 case '(':
11728                 case ')':
11729                 case '|':
11730                     goto loopdone;
11731                 case '\\':
11732                     /* Literal Escapes Switch
11733
11734                        This switch is meant to handle escape sequences that
11735                        resolve to a literal character.
11736
11737                        Every escape sequence that represents something
11738                        else, like an assertion or a char class, is handled
11739                        in the switch marked 'Special Escapes' above in this
11740                        routine, but also has an entry here as anything that
11741                        isn't explicitly mentioned here will be treated as
11742                        an unescaped equivalent literal.
11743                     */
11744
11745                     switch ((U8)*++p) {
11746                     /* These are all the special escapes. */
11747                     case 'A':             /* Start assertion */
11748                     case 'b': case 'B':   /* Word-boundary assertion*/
11749                     case 'C':             /* Single char !DANGEROUS! */
11750                     case 'd': case 'D':   /* digit class */
11751                     case 'g': case 'G':   /* generic-backref, pos assertion */
11752                     case 'h': case 'H':   /* HORIZWS */
11753                     case 'k': case 'K':   /* named backref, keep marker */
11754                     case 'p': case 'P':   /* Unicode property */
11755                               case 'R':   /* LNBREAK */
11756                     case 's': case 'S':   /* space class */
11757                     case 'v': case 'V':   /* VERTWS */
11758                     case 'w': case 'W':   /* word class */
11759                     case 'X':             /* eXtended Unicode "combining
11760                                              character sequence" */
11761                     case 'z': case 'Z':   /* End of line/string assertion */
11762                         --p;
11763                         goto loopdone;
11764
11765                     /* Anything after here is an escape that resolves to a
11766                        literal. (Except digits, which may or may not)
11767                      */
11768                     case 'n':
11769                         ender = '\n';
11770                         p++;
11771                         break;
11772                     case 'N': /* Handle a single-code point named character. */
11773                         /* The options cause it to fail if a multiple code
11774                          * point sequence.  Handle those in the switch() above
11775                          * */
11776                         RExC_parse = p + 1;
11777                         if (! grok_bslash_N(pRExC_state, NULL, &ender,
11778                                             flagp, depth, FALSE,
11779                                             FALSE /* not strict */ ))
11780                         {
11781                             if (*flagp & RESTART_UTF8)
11782                                 FAIL("panic: grok_bslash_N set RESTART_UTF8");
11783                             RExC_parse = p = oldp;
11784                             goto loopdone;
11785                         }
11786                         p = RExC_parse;
11787                         if (ender > 0xff) {
11788                             REQUIRE_UTF8;
11789                         }
11790                         break;
11791                     case 'r':
11792                         ender = '\r';
11793                         p++;
11794                         break;
11795                     case 't':
11796                         ender = '\t';
11797                         p++;
11798                         break;
11799                     case 'f':
11800                         ender = '\f';
11801                         p++;
11802                         break;
11803                     case 'e':
11804                           ender = ASCII_TO_NATIVE('\033');
11805                         p++;
11806                         break;
11807                     case 'a':
11808                           ender = '\a';
11809                         p++;
11810                         break;
11811                     case 'o':
11812                         {
11813                             UV result;
11814                             const char* error_msg;
11815
11816                             bool valid = grok_bslash_o(&p,
11817                                                        &result,
11818                                                        &error_msg,
11819                                                        TRUE, /* out warnings */
11820                                                        FALSE, /* not strict */
11821                                                        TRUE, /* Output warnings
11822                                                                 for non-
11823                                                                 portables */
11824                                                        UTF);
11825                             if (! valid) {
11826                                 RExC_parse = p; /* going to die anyway; point
11827                                                    to exact spot of failure */
11828                                 vFAIL(error_msg);
11829                             }
11830                             ender = result;
11831                             if (PL_encoding && ender < 0x100) {
11832                                 goto recode_encoding;
11833                             }
11834                             if (ender > 0xff) {
11835                                 REQUIRE_UTF8;
11836                             }
11837                             break;
11838                         }
11839                     case 'x':
11840                         {
11841                             UV result = UV_MAX; /* initialize to erroneous
11842                                                    value */
11843                             const char* error_msg;
11844
11845                             bool valid = grok_bslash_x(&p,
11846                                                        &result,
11847                                                        &error_msg,
11848                                                        TRUE, /* out warnings */
11849                                                        FALSE, /* not strict */
11850                                                        TRUE, /* Output warnings
11851                                                                 for non-
11852                                                                 portables */
11853                                                        UTF);
11854                             if (! valid) {
11855                                 RExC_parse = p; /* going to die anyway; point
11856                                                    to exact spot of failure */
11857                                 vFAIL(error_msg);
11858                             }
11859                             ender = result;
11860
11861                             if (PL_encoding && ender < 0x100) {
11862                                 goto recode_encoding;
11863                             }
11864                             if (ender > 0xff) {
11865                                 REQUIRE_UTF8;
11866                             }
11867                             break;
11868                         }
11869                     case 'c':
11870                         p++;
11871                         ender = grok_bslash_c(*p++, SIZE_ONLY);
11872                         break;
11873                     case '8': case '9': /* must be a backreference */
11874                         --p;
11875                         goto loopdone;
11876                     case '1': case '2': case '3':case '4':
11877                     case '5': case '6': case '7':
11878                         /* When we parse backslash escapes there is ambiguity
11879                          * between backreferences and octal escapes. Any escape
11880                          * from \1 - \9 is a backreference, any multi-digit
11881                          * escape which does not start with 0 and which when
11882                          * evaluated as decimal could refer to an already
11883                          * parsed capture buffer is a backreference. Anything else
11884                          * is octal.
11885                          *
11886                          * Note this implies that \118 could be interpreted as
11887                          * 118 OR as "\11" . "8" depending on whether there
11888                          * were 118 capture buffers defined already in the
11889                          * pattern.  */
11890
11891                         /* NOTE, RExC_npar is 1 more than the actual number of
11892                          * parens we have seen so far, hence the < RExC_npar below. */
11893
11894                         if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
11895                         {  /* Not to be treated as an octal constant, go
11896                                    find backref */
11897                             --p;
11898                             goto loopdone;
11899                         }
11900                         /* FALLTHROUGH */
11901                     case '0':
11902                         {
11903                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11904                             STRLEN numlen = 3;
11905                             ender = grok_oct(p, &numlen, &flags, NULL);
11906                             if (ender > 0xff) {
11907                                 REQUIRE_UTF8;
11908                             }
11909                             p += numlen;
11910                             if (SIZE_ONLY   /* like \08, \178 */
11911                                 && numlen < 3
11912                                 && p < RExC_end
11913                                 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
11914                             {
11915                                 reg_warn_non_literal_string(
11916                                          p + 1,
11917                                          form_short_octal_warning(p, numlen));
11918                             }
11919                         }
11920                         if (PL_encoding && ender < 0x100)
11921                             goto recode_encoding;
11922                         break;
11923                     recode_encoding:
11924                         if (! RExC_override_recoding) {
11925                             SV* enc = PL_encoding;
11926                             ender = reg_recode((const char)(U8)ender, &enc);
11927                             if (!enc && SIZE_ONLY)
11928                                 ckWARNreg(p, "Invalid escape in the specified encoding");
11929                             REQUIRE_UTF8;
11930                         }
11931                         break;
11932                     case '\0':
11933                         if (p >= RExC_end)
11934                             FAIL("Trailing \\");
11935                         /* FALL THROUGH */
11936                     default:
11937                         if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
11938                             /* Include any { following the alpha to emphasize
11939                              * that it could be part of an escape at some point
11940                              * in the future */
11941                             int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
11942                             ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
11943                         }
11944                         goto normal_default;
11945                     } /* End of switch on '\' */
11946                     break;
11947                 default:    /* A literal character */
11948
11949                     if (! SIZE_ONLY
11950                         && RExC_flags & RXf_PMf_EXTENDED
11951                         && ckWARN_d(WARN_DEPRECATED)
11952                         && is_PATWS_non_low_safe(p, RExC_end, UTF))
11953                     {
11954                         vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
11955                                 "Escape literal pattern white space under /x");
11956                     }
11957
11958                   normal_default:
11959                     if (UTF8_IS_START(*p) && UTF) {
11960                         STRLEN numlen;
11961                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
11962                                                &numlen, UTF8_ALLOW_DEFAULT);
11963                         p += numlen;
11964                     }
11965                     else
11966                         ender = (U8) *p++;
11967                     break;
11968                 } /* End of switch on the literal */
11969
11970                 /* Here, have looked at the literal character and <ender>
11971                  * contains its ordinal, <p> points to the character after it
11972                  */
11973
11974                 if ( RExC_flags & RXf_PMf_EXTENDED)
11975                     p = regwhite( pRExC_state, p );
11976
11977                 /* If the next thing is a quantifier, it applies to this
11978                  * character only, which means that this character has to be in
11979                  * its own node and can't just be appended to the string in an
11980                  * existing node, so if there are already other characters in
11981                  * the node, close the node with just them, and set up to do
11982                  * this character again next time through, when it will be the
11983                  * only thing in its new node */
11984                 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
11985                 {
11986                     p = oldp;
11987                     goto loopdone;
11988                 }
11989
11990                 if (! FOLD   /* The simple case, just append the literal */
11991                     || (LOC  /* Also don't fold for tricky chars under /l */
11992                         && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)))
11993                 {
11994                     if (UTF) {
11995                         const STRLEN unilen = reguni(pRExC_state, ender, s);
11996                         if (unilen > 0) {
11997                            s   += unilen;
11998                            len += unilen;
11999                         }
12000
12001                         /* The loop increments <len> each time, as all but this
12002                          * path (and one other) through it add a single byte to
12003                          * the EXACTish node.  But this one has changed len to
12004                          * be the correct final value, so subtract one to
12005                          * cancel out the increment that follows */
12006                         len--;
12007                     }
12008                     else {
12009                         REGC((char)ender, s++);
12010                     }
12011
12012                     /* Can get here if folding only if is one of the /l
12013                      * characters whose fold depends on the locale.  The
12014                      * occurrence of any of these indicate that we can't
12015                      * simplify things */
12016                     if (FOLD) {
12017                         maybe_exact = FALSE;
12018                         maybe_exactfu = FALSE;
12019                     }
12020                 }
12021                 else             /* FOLD */
12022                      if (! ( UTF
12023                         /* See comments for join_exact() as to why we fold this
12024                          * non-UTF at compile time */
12025                         || (node_type == EXACTFU
12026                             && ender == LATIN_SMALL_LETTER_SHARP_S)))
12027                 {
12028                     /* Here, are folding and are not UTF-8 encoded; therefore
12029                      * the character must be in the range 0-255, and is not /l
12030                      * (Not /l because we already handled these under /l in
12031                      * is_PROBLEMATIC_LOCALE_FOLD_cp) */
12032                     if (IS_IN_SOME_FOLD_L1(ender)) {
12033                         maybe_exact = FALSE;
12034
12035                         /* See if the character's fold differs between /d and
12036                          * /u.  This includes the multi-char fold SHARP S to
12037                          * 'ss' */
12038                         if (maybe_exactfu
12039                             && (PL_fold[ender] != PL_fold_latin1[ender]
12040                                 || ender == LATIN_SMALL_LETTER_SHARP_S
12041                                 || (len > 0
12042                                    && isARG2_lower_or_UPPER_ARG1('s', ender)
12043                                    && isARG2_lower_or_UPPER_ARG1('s',
12044                                                                  *(s-1)))))
12045                         {
12046                             maybe_exactfu = FALSE;
12047                         }
12048                     }
12049
12050                     /* Even when folding, we store just the input character, as
12051                      * we have an array that finds its fold quickly */
12052                     *(s++) = (char) ender;
12053                 }
12054                 else {  /* FOLD and UTF */
12055                     /* Unlike the non-fold case, we do actually have to
12056                      * calculate the results here in pass 1.  This is for two
12057                      * reasons, the folded length may be longer than the
12058                      * unfolded, and we have to calculate how many EXACTish
12059                      * nodes it will take; and we may run out of room in a node
12060                      * in the middle of a potential multi-char fold, and have
12061                      * to back off accordingly.  (Hence we can't use REGC for
12062                      * the simple case just below.) */
12063
12064                     UV folded;
12065                     if (isASCII(ender)) {
12066                         folded = toFOLD(ender);
12067                         *(s)++ = (U8) folded;
12068                     }
12069                     else {
12070                         STRLEN foldlen;
12071
12072                         folded = _to_uni_fold_flags(
12073                                      ender,
12074                                      (U8 *) s,
12075                                      &foldlen,
12076                                      FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12077                                                         ? FOLD_FLAGS_NOMIX_ASCII
12078                                                         : 0));
12079                         s += foldlen;
12080
12081                         /* The loop increments <len> each time, as all but this
12082                          * path (and one other) through it add a single byte to
12083                          * the EXACTish node.  But this one has changed len to
12084                          * be the correct final value, so subtract one to
12085                          * cancel out the increment that follows */
12086                         len += foldlen - 1;
12087                     }
12088                     /* If this node only contains non-folding code points so
12089                      * far, see if this new one is also non-folding */
12090                     if (maybe_exact) {
12091                         if (folded != ender) {
12092                             maybe_exact = FALSE;
12093                         }
12094                         else {
12095                             /* Here the fold is the original; we have to check
12096                              * further to see if anything folds to it */
12097                             if (_invlist_contains_cp(PL_utf8_foldable,
12098                                                         ender))
12099                             {
12100                                 maybe_exact = FALSE;
12101                             }
12102                         }
12103                     }
12104                     ender = folded;
12105                 }
12106
12107                 if (next_is_quantifier) {
12108
12109                     /* Here, the next input is a quantifier, and to get here,
12110                      * the current character is the only one in the node.
12111                      * Also, here <len> doesn't include the final byte for this
12112                      * character */
12113                     len++;
12114                     goto loopdone;
12115                 }
12116
12117             } /* End of loop through literal characters */
12118
12119             /* Here we have either exhausted the input or ran out of room in
12120              * the node.  (If we encountered a character that can't be in the
12121              * node, transfer is made directly to <loopdone>, and so we
12122              * wouldn't have fallen off the end of the loop.)  In the latter
12123              * case, we artificially have to split the node into two, because
12124              * we just don't have enough space to hold everything.  This
12125              * creates a problem if the final character participates in a
12126              * multi-character fold in the non-final position, as a match that
12127              * should have occurred won't, due to the way nodes are matched,
12128              * and our artificial boundary.  So back off until we find a non-
12129              * problematic character -- one that isn't at the beginning or
12130              * middle of such a fold.  (Either it doesn't participate in any
12131              * folds, or appears only in the final position of all the folds it
12132              * does participate in.)  A better solution with far fewer false
12133              * positives, and that would fill the nodes more completely, would
12134              * be to actually have available all the multi-character folds to
12135              * test against, and to back-off only far enough to be sure that
12136              * this node isn't ending with a partial one.  <upper_parse> is set
12137              * further below (if we need to reparse the node) to include just
12138              * up through that final non-problematic character that this code
12139              * identifies, so when it is set to less than the full node, we can
12140              * skip the rest of this */
12141             if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
12142
12143                 const STRLEN full_len = len;
12144
12145                 assert(len >= MAX_NODE_STRING_SIZE);
12146
12147                 /* Here, <s> points to the final byte of the final character.
12148                  * Look backwards through the string until find a non-
12149                  * problematic character */
12150
12151                 if (! UTF) {
12152
12153                     /* This has no multi-char folds to non-UTF characters */
12154                     if (ASCII_FOLD_RESTRICTED) {
12155                         goto loopdone;
12156                     }
12157
12158                     while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
12159                     len = s - s0 + 1;
12160                 }
12161                 else {
12162                     if (!  PL_NonL1NonFinalFold) {
12163                         PL_NonL1NonFinalFold = _new_invlist_C_array(
12164                                         NonL1_Perl_Non_Final_Folds_invlist);
12165                     }
12166
12167                     /* Point to the first byte of the final character */
12168                     s = (char *) utf8_hop((U8 *) s, -1);
12169
12170                     while (s >= s0) {   /* Search backwards until find
12171                                            non-problematic char */
12172                         if (UTF8_IS_INVARIANT(*s)) {
12173
12174                             /* There are no ascii characters that participate
12175                              * in multi-char folds under /aa.  In EBCDIC, the
12176                              * non-ascii invariants are all control characters,
12177                              * so don't ever participate in any folds. */
12178                             if (ASCII_FOLD_RESTRICTED
12179                                 || ! IS_NON_FINAL_FOLD(*s))
12180                             {
12181                                 break;
12182                             }
12183                         }
12184                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
12185                             if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
12186                                                                   *s, *(s+1))))
12187                             {
12188                                 break;
12189                             }
12190                         }
12191                         else if (! _invlist_contains_cp(
12192                                         PL_NonL1NonFinalFold,
12193                                         valid_utf8_to_uvchr((U8 *) s, NULL)))
12194                         {
12195                             break;
12196                         }
12197
12198                         /* Here, the current character is problematic in that
12199                          * it does occur in the non-final position of some
12200                          * fold, so try the character before it, but have to
12201                          * special case the very first byte in the string, so
12202                          * we don't read outside the string */
12203                         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
12204                     } /* End of loop backwards through the string */
12205
12206                     /* If there were only problematic characters in the string,
12207                      * <s> will point to before s0, in which case the length
12208                      * should be 0, otherwise include the length of the
12209                      * non-problematic character just found */
12210                     len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
12211                 }
12212
12213                 /* Here, have found the final character, if any, that is
12214                  * non-problematic as far as ending the node without splitting
12215                  * it across a potential multi-char fold.  <len> contains the
12216                  * number of bytes in the node up-to and including that
12217                  * character, or is 0 if there is no such character, meaning
12218                  * the whole node contains only problematic characters.  In
12219                  * this case, give up and just take the node as-is.  We can't
12220                  * do any better */
12221                 if (len == 0) {
12222                     len = full_len;
12223
12224                     /* If the node ends in an 's' we make sure it stays EXACTF,
12225                      * as if it turns into an EXACTFU, it could later get
12226                      * joined with another 's' that would then wrongly match
12227                      * the sharp s */
12228                     if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
12229                     {
12230                         maybe_exactfu = FALSE;
12231                     }
12232                 } else {
12233
12234                     /* Here, the node does contain some characters that aren't
12235                      * problematic.  If one such is the final character in the
12236                      * node, we are done */
12237                     if (len == full_len) {
12238                         goto loopdone;
12239                     }
12240                     else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
12241
12242                         /* If the final character is problematic, but the
12243                          * penultimate is not, back-off that last character to
12244                          * later start a new node with it */
12245                         p = oldp;
12246                         goto loopdone;
12247                     }
12248
12249                     /* Here, the final non-problematic character is earlier
12250                      * in the input than the penultimate character.  What we do
12251                      * is reparse from the beginning, going up only as far as
12252                      * this final ok one, thus guaranteeing that the node ends
12253                      * in an acceptable character.  The reason we reparse is
12254                      * that we know how far in the character is, but we don't
12255                      * know how to correlate its position with the input parse.
12256                      * An alternate implementation would be to build that
12257                      * correlation as we go along during the original parse,
12258                      * but that would entail extra work for every node, whereas
12259                      * this code gets executed only when the string is too
12260                      * large for the node, and the final two characters are
12261                      * problematic, an infrequent occurrence.  Yet another
12262                      * possible strategy would be to save the tail of the
12263                      * string, and the next time regatom is called, initialize
12264                      * with that.  The problem with this is that unless you
12265                      * back off one more character, you won't be guaranteed
12266                      * regatom will get called again, unless regbranch,
12267                      * regpiece ... are also changed.  If you do back off that
12268                      * extra character, so that there is input guaranteed to
12269                      * force calling regatom, you can't handle the case where
12270                      * just the first character in the node is acceptable.  I
12271                      * (khw) decided to try this method which doesn't have that
12272                      * pitfall; if performance issues are found, we can do a
12273                      * combination of the current approach plus that one */
12274                     upper_parse = len;
12275                     len = 0;
12276                     s = s0;
12277                     goto reparse;
12278                 }
12279             }   /* End of verifying node ends with an appropriate char */
12280
12281         loopdone:   /* Jumped to when encounters something that shouldn't be in
12282                        the node */
12283
12284             /* I (khw) don't know if you can get here with zero length, but the
12285              * old code handled this situation by creating a zero-length EXACT
12286              * node.  Might as well be NOTHING instead */
12287             if (len == 0) {
12288                 OP(ret) = NOTHING;
12289             }
12290             else {
12291                 if (FOLD) {
12292                     /* If 'maybe_exact' is still set here, means there are no
12293                      * code points in the node that participate in folds;
12294                      * similarly for 'maybe_exactfu' and code points that match
12295                      * differently depending on UTF8ness of the target string
12296                      * (for /u), or depending on locale for /l */
12297                     if (maybe_exact) {
12298                         OP(ret) = EXACT;
12299                     }
12300                     else if (maybe_exactfu) {
12301                         OP(ret) = EXACTFU;
12302                     }
12303                 }
12304                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
12305                                            FALSE /* Don't look to see if could
12306                                                     be turned into an EXACT
12307                                                     node, as we have already
12308                                                     computed that */
12309                                           );
12310             }
12311
12312             RExC_parse = p - 1;
12313             Set_Node_Cur_Length(ret, parse_start);
12314             nextchar(pRExC_state);
12315             {
12316                 /* len is STRLEN which is unsigned, need to copy to signed */
12317                 IV iv = len;
12318                 if (iv < 0)
12319                     vFAIL("Internal disaster");
12320             }
12321
12322         } /* End of label 'defchar:' */
12323         break;
12324     } /* End of giant switch on input character */
12325
12326     return(ret);
12327 }
12328
12329 STATIC char *
12330 S_regwhite( RExC_state_t *pRExC_state, char *p )
12331 {
12332     const char *e = RExC_end;
12333
12334     PERL_ARGS_ASSERT_REGWHITE;
12335
12336     while (p < e) {
12337         if (isSPACE(*p))
12338             ++p;
12339         else if (*p == '#') {
12340             bool ended = 0;
12341             do {
12342                 if (*p++ == '\n') {
12343                     ended = 1;
12344                     break;
12345                 }
12346             } while (p < e);
12347             if (!ended)
12348                 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
12349         }
12350         else
12351             break;
12352     }
12353     return p;
12354 }
12355
12356 STATIC char *
12357 S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
12358 {
12359     /* Returns the next non-pattern-white space, non-comment character (the
12360      * latter only if 'recognize_comment is true) in the string p, which is
12361      * ended by RExC_end.  If there is no line break ending a comment,
12362      * RExC_seen has added the REG_RUN_ON_COMMENT_SEEN flag; */
12363     const char *e = RExC_end;
12364
12365     PERL_ARGS_ASSERT_REGPATWS;
12366
12367     while (p < e) {
12368         STRLEN len;
12369         if ((len = is_PATWS_safe(p, e, UTF))) {
12370             p += len;
12371         }
12372         else if (recognize_comment && *p == '#') {
12373             bool ended = 0;
12374             do {
12375                 p++;
12376                 if (is_LNBREAK_safe(p, e, UTF)) {
12377                     ended = 1;
12378                     break;
12379                 }
12380             } while (p < e);
12381             if (!ended)
12382                 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
12383         }
12384         else
12385             break;
12386     }
12387     return p;
12388 }
12389
12390 STATIC void
12391 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
12392 {
12393     /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
12394      * sets up the bitmap and any flags, removing those code points from the
12395      * inversion list, setting it to NULL should it become completely empty */
12396
12397     PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
12398     assert(PL_regkind[OP(node)] == ANYOF);
12399
12400     ANYOF_BITMAP_ZERO(node);
12401     if (*invlist_ptr) {
12402
12403         /* This gets set if we actually need to modify things */
12404         bool change_invlist = FALSE;
12405
12406         UV start, end;
12407
12408         /* Start looking through *invlist_ptr */
12409         invlist_iterinit(*invlist_ptr);
12410         while (invlist_iternext(*invlist_ptr, &start, &end)) {
12411             UV high;
12412             int i;
12413
12414             if (end == UV_MAX && start <= 256) {
12415                 ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL;
12416             }
12417             else if (end >= 256) {
12418                 ANYOF_FLAGS(node) |= ANYOF_UTF8;
12419             }
12420
12421             /* Quit if are above what we should change */
12422             if (start > 255) {
12423                 break;
12424             }
12425
12426             change_invlist = TRUE;
12427
12428             /* Set all the bits in the range, up to the max that we are doing */
12429             high = (end < 255) ? end : 255;
12430             for (i = start; i <= (int) high; i++) {
12431                 if (! ANYOF_BITMAP_TEST(node, i)) {
12432                     ANYOF_BITMAP_SET(node, i);
12433                 }
12434             }
12435         }
12436         invlist_iterfinish(*invlist_ptr);
12437
12438         /* Done with loop; remove any code points that are in the bitmap from
12439          * *invlist_ptr; similarly for code points above latin1 if we have a
12440          * flag to match all of them anyways */
12441         if (change_invlist) {
12442             _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr);
12443         }
12444         if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
12445             _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr);
12446         }
12447
12448         /* If have completely emptied it, remove it completely */
12449         if (_invlist_len(*invlist_ptr) == 0) {
12450             SvREFCNT_dec_NN(*invlist_ptr);
12451             *invlist_ptr = NULL;
12452         }
12453     }
12454 }
12455
12456 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
12457    Character classes ([:foo:]) can also be negated ([:^foo:]).
12458    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
12459    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
12460    but trigger failures because they are currently unimplemented. */
12461
/* Classify the character that follows '[' in a bracketed POSIX construct:
 * ':' introduces a character class, which is handled; '=' (equivalence
 * class) and '.' (composite) are recognized but not yet implemented */
#define POSIXCC_DONE(c)   (':' == (c))
#define POSIXCC_NOTYET(c) ('=' == (c) || '.' == (c))
#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
12465
12466 PERL_STATIC_INLINE I32
12467 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
12468 {
12469     dVAR;
12470     I32 namedclass = OOB_NAMEDCLASS;
12471
12472     PERL_ARGS_ASSERT_REGPPOSIXCC;
12473
12474     if (value == '[' && RExC_parse + 1 < RExC_end &&
12475         /* I smell either [: or [= or [. -- POSIX has been here, right? */
12476         POSIXCC(UCHARAT(RExC_parse)))
12477     {
12478         const char c = UCHARAT(RExC_parse);
12479         char* const s = RExC_parse++;
12480
12481         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
12482             RExC_parse++;
12483         if (RExC_parse == RExC_end) {
12484             if (strict) {
12485
12486                 /* Try to give a better location for the error (than the end of
12487                  * the string) by looking for the matching ']' */
12488                 RExC_parse = s;
12489                 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
12490                     RExC_parse++;
12491                 }
12492                 vFAIL2("Unmatched '%c' in POSIX class", c);
12493             }
12494             /* Grandfather lone [:, [=, [. */
12495             RExC_parse = s;
12496         }
12497         else {
12498             const char* const t = RExC_parse++; /* skip over the c */
12499             assert(*t == c);
12500
12501             if (UCHARAT(RExC_parse) == ']') {
12502                 const char *posixcc = s + 1;
12503                 RExC_parse++; /* skip over the ending ] */
12504
12505                 if (*s == ':') {
12506                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
12507                     const I32 skip = t - posixcc;
12508
12509                     /* Initially switch on the length of the name.  */
12510                     switch (skip) {
12511                     case 4:
12512                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
12513                                                           this is the Perl \w
12514                                                         */
12515                             namedclass = ANYOF_WORDCHAR;
12516                         break;
12517                     case 5:
12518                         /* Names all of length 5.  */
12519                         /* alnum alpha ascii blank cntrl digit graph lower
12520                            print punct space upper  */
12521                         /* Offset 4 gives the best switch position.  */
12522                         switch (posixcc[4]) {
12523                         case 'a':
12524                             if (memEQ(posixcc, "alph", 4)) /* alpha */
12525                                 namedclass = ANYOF_ALPHA;
12526                             break;
12527                         case 'e':
12528                             if (memEQ(posixcc, "spac", 4)) /* space */
12529                                 namedclass = ANYOF_PSXSPC;
12530                             break;
12531                         case 'h':
12532                             if (memEQ(posixcc, "grap", 4)) /* graph */
12533                                 namedclass = ANYOF_GRAPH;
12534                             break;
12535                         case 'i':
12536                             if (memEQ(posixcc, "asci", 4)) /* ascii */
12537                                 namedclass = ANYOF_ASCII;
12538                             break;
12539                         case 'k':
12540                             if (memEQ(posixcc, "blan", 4)) /* blank */
12541                                 namedclass = ANYOF_BLANK;
12542                             break;
12543                         case 'l':
12544                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
12545                                 namedclass = ANYOF_CNTRL;
12546                             break;
12547                         case 'm':
12548                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
12549                                 namedclass = ANYOF_ALPHANUMERIC;
12550                             break;
12551                         case 'r':
12552                             if (memEQ(posixcc, "lowe", 4)) /* lower */
12553                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
12554                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
12555                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
12556                             break;
12557                         case 't':
12558                             if (memEQ(posixcc, "digi", 4)) /* digit */
12559                                 namedclass = ANYOF_DIGIT;
12560                             else if (memEQ(posixcc, "prin", 4)) /* print */
12561                                 namedclass = ANYOF_PRINT;
12562                             else if (memEQ(posixcc, "punc", 4)) /* punct */
12563                                 namedclass = ANYOF_PUNCT;
12564                             break;
12565                         }
12566                         break;
12567                     case 6:
12568                         if (memEQ(posixcc, "xdigit", 6))
12569                             namedclass = ANYOF_XDIGIT;
12570                         break;
12571                     }
12572
12573                     if (namedclass == OOB_NAMEDCLASS)
12574                         vFAIL2utf8f(
12575                             "POSIX class [:%"UTF8f":] unknown",
12576                             UTF8fARG(UTF, t - s - 1, s + 1));
12577
12578                     /* The #defines are structured so each complement is +1 to
12579                      * the normal one */
12580                     if (complement) {
12581                         namedclass++;
12582                     }
12583                     assert (posixcc[skip] == ':');
12584                     assert (posixcc[skip+1] == ']');
12585                 } else if (!SIZE_ONLY) {
12586                     /* [[=foo=]] and [[.foo.]] are still future. */
12587
12588                     /* adjust RExC_parse so the warning shows after
12589                        the class closes */
12590                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
12591                         RExC_parse++;
12592                     vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
12593                 }
12594             } else {
12595                 /* Maternal grandfather:
12596                  * "[:" ending in ":" but not in ":]" */
12597                 if (strict) {
12598                     vFAIL("Unmatched '[' in POSIX class");
12599                 }
12600
12601                 /* Grandfather lone [:, [=, [. */
12602                 RExC_parse = s;
12603             }
12604         }
12605     }
12606
12607     return namedclass;
12608 }
12609
12610 STATIC bool
12611 S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state)
12612 {
12613     /* This applies some heuristics at the current parse position (which should
12614      * be at a '[') to see if what follows might be intended to be a [:posix:]
12615      * class.  It returns true if it really is a posix class, of course, but it
12616      * also can return true if it thinks that what was intended was a posix
12617      * class that didn't quite make it.
12618      *
12619      * It will return true for
12620      *      [:alphanumerics:
12621      *      [:alphanumerics]  (as long as the ] isn't followed immediately by a
12622      *                         ')' indicating the end of the (?[
12623      *      [:any garbage including %^&$ punctuation:]
12624      *
12625      * This is designed to be called only from S_handle_regex_sets; it could be
12626      * easily adapted to be called from the spot at the beginning of regclass()
12627      * that checks to see in a normal bracketed class if the surrounding []
12628      * have been omitted ([:word:] instead of [[:word:]]).  But doing so would
12629      * change long-standing behavior, so I (khw) didn't do that */
12630     char* p = RExC_parse + 1;
12631     char first_char = *p;
12632
12633     PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
12634
12635     assert(*(p - 1) == '[');
12636
12637     if (! POSIXCC(first_char)) {
12638         return FALSE;
12639     }
12640
12641     p++;
12642     while (p < RExC_end && isWORDCHAR(*p)) p++;
12643
12644     if (p >= RExC_end) {
12645         return FALSE;
12646     }
12647
12648     if (p - RExC_parse > 2    /* Got at least 1 word character */
12649         && (*p == first_char
12650             || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
12651     {
12652         return TRUE;
12653     }
12654
12655     p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
12656
12657     return (p
12658             && p - RExC_parse > 2 /* [:] evaluates to colon;
12659                                       [::] is a bad posix class. */
12660             && first_char == *(p - 1));
12661 }
12662
12663 STATIC regnode *
12664 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
12665                     I32 *flagp, U32 depth,
12666                     char * const oregcomp_parse)
12667 {
12668     /* Handle the (?[...]) construct to do set operations */
12669
12670     U8 curchar;
12671     UV start, end;      /* End points of code point ranges */
12672     SV* result_string;
12673     char *save_end, *save_parse;
12674     SV* final;
12675     STRLEN len;
12676     regnode* node;
12677     AV* stack;
12678     const bool save_fold = FOLD;
12679
12680     GET_RE_DEBUG_FLAGS_DECL;
12681
12682     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
12683
12684     if (LOC) {
12685         vFAIL("(?[...]) not valid in locale");
12686     }
12687     RExC_uni_semantics = 1;
12688
12689     /* This will return only an ANYOF regnode, or (unlikely) something smaller
12690      * (such as EXACT).  Thus we can skip most everything if just sizing.  We
12691      * call regclass to handle '[]' so as to not have to reinvent its parsing
12692      * rules here (throwing away the size it computes each time).  And, we exit
12693      * upon an unescaped ']' that isn't one ending a regclass.  To do both
12694      * these things, we need to realize that something preceded by a backslash
12695      * is escaped, so we have to keep track of backslashes */
12696     if (SIZE_ONLY) {
12697         UV depth = 0; /* how many nested (?[...]) constructs */
12698
12699         Perl_ck_warner_d(aTHX_
12700             packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
12701             "The regex_sets feature is experimental" REPORT_LOCATION,
12702                 UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
12703                 UTF8fARG(UTF,
12704                          RExC_end - RExC_start - (RExC_parse - RExC_precomp),
12705                          RExC_precomp + (RExC_parse - RExC_precomp)));
12706
12707         while (RExC_parse < RExC_end) {
12708             SV* current = NULL;
12709             RExC_parse = regpatws(pRExC_state, RExC_parse,
12710                                 TRUE); /* means recognize comments */
12711             switch (*RExC_parse) {
12712                 case '?':
12713                     if (RExC_parse[1] == '[') depth++, RExC_parse++;
12714                     /* FALL THROUGH */
12715                 default:
12716                     break;
12717                 case '\\':
12718                     /* Skip the next byte (which could cause us to end up in
12719                      * the middle of a UTF-8 character, but since none of those
12720                      * are confusable with anything we currently handle in this
12721                      * switch (invariants all), it's safe.  We'll just hit the
12722                      * default: case next time and keep on incrementing until
12723                      * we find one of the invariants we do handle. */
12724                     RExC_parse++;
12725                     break;
12726                 case '[':
12727                 {
12728                     /* If this looks like it is a [:posix:] class, leave the
12729                      * parse pointer at the '[' to fool regclass() into
12730                      * thinking it is part of a '[[:posix:]]'.  That function
12731                      * will use strict checking to force a syntax error if it
12732                      * doesn't work out to a legitimate class */
12733                     bool is_posix_class
12734                                     = could_it_be_a_POSIX_class(pRExC_state);
12735                     if (! is_posix_class) {
12736                         RExC_parse++;
12737                     }
12738
12739                     /* regclass() can only return RESTART_UTF8 if multi-char
12740                        folds are allowed.  */
12741                     if (!regclass(pRExC_state, flagp,depth+1,
12742                                   is_posix_class, /* parse the whole char
12743                                                      class only if not a
12744                                                      posix class */
12745                                   FALSE, /* don't allow multi-char folds */
12746                                   TRUE, /* silence non-portable warnings. */
12747                                   &current))
12748                         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12749                               (UV) *flagp);
12750
12751                     /* function call leaves parse pointing to the ']', except
12752                      * if we faked it */
12753                     if (is_posix_class) {
12754                         RExC_parse--;
12755                     }
12756
12757                     SvREFCNT_dec(current);   /* In case it returned something */
12758                     break;
12759                 }
12760
12761                 case ']':
12762                     if (depth--) break;
12763                     RExC_parse++;
12764                     if (RExC_parse < RExC_end
12765                         && *RExC_parse == ')')
12766                     {
12767                         node = reganode(pRExC_state, ANYOF, 0);
12768                         RExC_size += ANYOF_SKIP;
12769                         nextchar(pRExC_state);
12770                         Set_Node_Length(node,
12771                                 RExC_parse - oregcomp_parse + 1); /* MJD */
12772                         return node;
12773                     }
12774                     goto no_close;
12775             }
12776             RExC_parse++;
12777         }
12778
12779         no_close:
12780         FAIL("Syntax error in (?[...])");
12781     }
12782
12783     /* Pass 2 only after this.  Everything in this construct is a
12784      * metacharacter.  Operands begin with either a '\' (for an escape
12785      * sequence), or a '[' for a bracketed character class.  Any other
12786      * character should be an operator, or parenthesis for grouping.  Both
12787      * types of operands are handled by calling regclass() to parse them.  It
12788      * is called with a parameter to indicate to return the computed inversion
12789      * list.  The parsing here is implemented via a stack.  Each entry on the
12790      * stack is a single character representing one of the operators, or the
12791      * '('; or else a pointer to an operand inversion list. */
12792
12793 #define IS_OPERAND(a)  (! SvIOK(a))
12794
12795     /* The stack starts empty.  It is a syntax error if the first thing parsed
12796      * is a binary operator; everything else is pushed on the stack.  When an
12797      * operand is parsed, the top of the stack is examined.  If it is a binary
12798      * operator, the item before it should be an operand, and both are replaced
12799      * by the result of doing that operation on the new operand and the one on
12800      * the stack.   Thus a sequence of binary operands is reduced to a single
12801      * one before the next one is parsed.
12802      *
12803      * A unary operator may immediately follow a binary in the input, for
12804      * example
12805      *      [a] + ! [b]
12806      * When an operand is parsed and the top of the stack is a unary operator,
12807      * the operation is performed, and then the stack is rechecked to see if
12808      * this new operand is part of a binary operation; if so, it is handled as
12809      * above.
12810      *
12811      * A '(' is simply pushed on the stack; it is valid only if the stack is
12812      * empty, or the top element of the stack is an operator or another '('
12813      * (for which the parenthesized expression will become an operand).  By the
12814      * time the corresponding ')' is parsed everything in between should have
12815      * been parsed and evaluated to a single operand (or else is a syntax
12816      * error), and is handled as a regular operand */
12817
12818     sv_2mortal((SV *)(stack = newAV()));
12819
12820     while (RExC_parse < RExC_end) {
12821         I32 top_index = av_tindex(stack);
12822         SV** top_ptr;
12823         SV* current = NULL;
12824
12825         /* Skip white space */
12826         RExC_parse = regpatws(pRExC_state, RExC_parse,
12827                                 TRUE); /* means recognize comments */
12828         if (RExC_parse >= RExC_end) {
12829             Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
12830         }
12831         if ((curchar = UCHARAT(RExC_parse)) == ']') {
12832             break;
12833         }
12834
12835         switch (curchar) {
12836
12837             case '?':
12838                 if (av_tindex(stack) >= 0   /* This makes sure that we can
12839                                                safely subtract 1 from
12840                                                RExC_parse in the next clause.
12841                                                If we have something on the
12842                                                stack, we have parsed something
12843                                              */
12844                     && UCHARAT(RExC_parse - 1) == '('
12845                     && RExC_parse < RExC_end)
12846                 {
12847                     /* If is a '(?', could be an embedded '(?flags:(?[...])'.
12848                      * This happens when we have some thing like
12849                      *
12850                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
12851                      *   ...
12852                      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
12853                      *
12854                      * Here we would be handling the interpolated
12855                      * '$thai_or_lao'.  We handle this by a recursive call to
12856                      * ourselves which returns the inversion list the
12857                      * interpolated expression evaluates to.  We use the flags
12858                      * from the interpolated pattern. */
12859                     U32 save_flags = RExC_flags;
12860                     const char * const save_parse = ++RExC_parse;
12861
12862                     parse_lparen_question_flags(pRExC_state);
12863
12864                     if (RExC_parse == save_parse  /* Makes sure there was at
12865                                                      least one flag (or this
12866                                                      embedding wasn't compiled)
12867                                                    */
12868                         || RExC_parse >= RExC_end - 4
12869                         || UCHARAT(RExC_parse) != ':'
12870                         || UCHARAT(++RExC_parse) != '('
12871                         || UCHARAT(++RExC_parse) != '?'
12872                         || UCHARAT(++RExC_parse) != '[')
12873                     {
12874
12875                         /* In combination with the above, this moves the
12876                          * pointer to the point just after the first erroneous
12877                          * character (or if there are no flags, to where they
12878                          * should have been) */
12879                         if (RExC_parse >= RExC_end - 4) {
12880                             RExC_parse = RExC_end;
12881                         }
12882                         else if (RExC_parse != save_parse) {
12883                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12884                         }
12885                         vFAIL("Expecting '(?flags:(?[...'");
12886                     }
12887                     RExC_parse++;
12888                     (void) handle_regex_sets(pRExC_state, &current, flagp,
12889                                                     depth+1, oregcomp_parse);
12890
12891                     /* Here, 'current' contains the embedded expression's
12892                      * inversion list, and RExC_parse points to the trailing
12893                      * ']'; the next character should be the ')' which will be
12894                      * paired with the '(' that has been put on the stack, so
12895                      * the whole embedded expression reduces to '(operand)' */
12896                     RExC_parse++;
12897
12898                     RExC_flags = save_flags;
12899                     goto handle_operand;
12900                 }
12901                 /* FALL THROUGH */
12902
12903             default:
12904                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12905                 vFAIL("Unexpected character");
12906
12907             case '\\':
12908                 /* regclass() can only return RESTART_UTF8 if multi-char
12909                    folds are allowed.  */
12910                 if (!regclass(pRExC_state, flagp,depth+1,
12911                               TRUE, /* means parse just the next thing */
12912                               FALSE, /* don't allow multi-char folds */
12913                               FALSE, /* don't silence non-portable warnings.  */
12914                               &current))
12915                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12916                           (UV) *flagp);
12917                 /* regclass() will return with parsing just the \ sequence,
12918                  * leaving the parse pointer at the next thing to parse */
12919                 RExC_parse--;
12920                 goto handle_operand;
12921
12922             case '[':   /* Is a bracketed character class */
12923             {
12924                 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
12925
12926                 if (! is_posix_class) {
12927                     RExC_parse++;
12928                 }
12929
12930                 /* regclass() can only return RESTART_UTF8 if multi-char
12931                    folds are allowed.  */
12932                 if(!regclass(pRExC_state, flagp,depth+1,
12933                              is_posix_class, /* parse the whole char class
12934                                                 only if not a posix class */
12935                              FALSE, /* don't allow multi-char folds */
12936                              FALSE, /* don't silence non-portable warnings.  */
12937                              &current))
12938                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12939                           (UV) *flagp);
12940                 /* function call leaves parse pointing to the ']', except if we
12941                  * faked it */
12942                 if (is_posix_class) {
12943                     RExC_parse--;
12944                 }
12945
12946                 goto handle_operand;
12947             }
12948
12949             case '&':
12950             case '|':
12951             case '+':
12952             case '-':
12953             case '^':
12954                 if (top_index < 0
12955                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
12956                     || ! IS_OPERAND(*top_ptr))
12957                 {
12958                     RExC_parse++;
12959                     vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
12960                 }
12961                 av_push(stack, newSVuv(curchar));
12962                 break;
12963
12964             case '!':
12965                 av_push(stack, newSVuv(curchar));
12966                 break;
12967
12968             case '(':
12969                 if (top_index >= 0) {
12970                     top_ptr = av_fetch(stack, top_index, FALSE);
12971                     assert(top_ptr);
12972                     if (IS_OPERAND(*top_ptr)) {
12973                         RExC_parse++;
12974                         vFAIL("Unexpected '(' with no preceding operator");
12975                     }
12976                 }
12977                 av_push(stack, newSVuv(curchar));
12978                 break;
12979
12980             case ')':
12981             {
12982                 SV* lparen;
12983                 if (top_index < 1
12984                     || ! (current = av_pop(stack))
12985                     || ! IS_OPERAND(current)
12986                     || ! (lparen = av_pop(stack))
12987                     || IS_OPERAND(lparen)
12988                     || SvUV(lparen) != '(')
12989                 {
12990                     SvREFCNT_dec(current);
12991                     RExC_parse++;
12992                     vFAIL("Unexpected ')'");
12993                 }
12994                 top_index -= 2;
12995                 SvREFCNT_dec_NN(lparen);
12996
12997                 /* FALL THROUGH */
12998             }
12999
13000               handle_operand:
13001
13002                 /* Here, we have an operand to process, in 'current' */
13003
13004                 if (top_index < 0) {    /* Just push if stack is empty */
13005                     av_push(stack, current);
13006                 }
13007                 else {
13008                     SV* top = av_pop(stack);
13009                     SV *prev = NULL;
13010                     char current_operator;
13011
13012                     if (IS_OPERAND(top)) {
13013                         SvREFCNT_dec_NN(top);
13014                         SvREFCNT_dec_NN(current);
13015                         vFAIL("Operand with no preceding operator");
13016                     }
13017                     current_operator = (char) SvUV(top);
13018                     switch (current_operator) {
13019                         case '(':   /* Push the '(' back on followed by the new
13020                                        operand */
13021                             av_push(stack, top);
13022                             av_push(stack, current);
13023                             SvREFCNT_inc(top);  /* Counters the '_dec' done
13024                                                    just after the 'break', so
13025                                                    it doesn't get wrongly freed
13026                                                  */
13027                             break;
13028
13029                         case '!':
13030                             _invlist_invert(current);
13031
13032                             /* Unlike binary operators, the top of the stack,
13033                              * now that this unary one has been popped off, may
13034                              * legally be an operator, and we now have operand
13035                              * for it. */
13036                             top_index--;
13037                             SvREFCNT_dec_NN(top);
13038                             goto handle_operand;
13039
13040                         case '&':
13041                             prev = av_pop(stack);
13042                             _invlist_intersection(prev,
13043                                                    current,
13044                                                    &current);
13045                             av_push(stack, current);
13046                             break;
13047
13048                         case '|':
13049                         case '+':
13050                             prev = av_pop(stack);
13051                             _invlist_union(prev, current, &current);
13052                             av_push(stack, current);
13053                             break;
13054
13055                         case '-':
13056                             prev = av_pop(stack);;
13057                             _invlist_subtract(prev, current, &current);
13058                             av_push(stack, current);
13059                             break;
13060
13061                         case '^':   /* The union minus the intersection */
13062                         {
13063                             SV* i = NULL;
13064                             SV* u = NULL;
13065                             SV* element;
13066
13067                             prev = av_pop(stack);
13068                             _invlist_union(prev, current, &u);
13069                             _invlist_intersection(prev, current, &i);
13070                             /* _invlist_subtract will overwrite current
13071                                 without freeing what it already contains */
13072                             element = current;
13073                             _invlist_subtract(u, i, &current);
13074                             av_push(stack, current);
13075                             SvREFCNT_dec_NN(i);
13076                             SvREFCNT_dec_NN(u);
13077                             SvREFCNT_dec_NN(element);
13078                             break;
13079                         }
13080
13081                         default:
13082                             Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
13083                 }
13084                 SvREFCNT_dec_NN(top);
13085                 SvREFCNT_dec(prev);
13086             }
13087         }
13088
13089         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13090     }
13091
13092     if (av_tindex(stack) < 0   /* Was empty */
13093         || ((final = av_pop(stack)) == NULL)
13094         || ! IS_OPERAND(final)
13095         || av_tindex(stack) >= 0)  /* More left on stack */
13096     {
13097         vFAIL("Incomplete expression within '(?[ ])'");
13098     }
13099
13100     /* Here, 'final' is the resultant inversion list from evaluating the
13101      * expression.  Return it if so requested */
13102     if (return_invlist) {
13103         *return_invlist = final;
13104         return END;
13105     }
13106
13107     /* Otherwise generate a resultant node, based on 'final'.  regclass() is
13108      * expecting a string of ranges and individual code points */
13109     invlist_iterinit(final);
13110     result_string = newSVpvs("");
13111     while (invlist_iternext(final, &start, &end)) {
13112         if (start == end) {
13113             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
13114         }
13115         else {
13116             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
13117                                                      start,          end);
13118         }
13119     }
13120
13121     save_parse = RExC_parse;
13122     RExC_parse = SvPV(result_string, len);
13123     save_end = RExC_end;
13124     RExC_end = RExC_parse + len;
13125
13126     /* We turn off folding around the call, as the class we have constructed
13127      * already has all folding taken into consideration, and we don't want
13128      * regclass() to add to that */
13129     RExC_flags &= ~RXf_PMf_FOLD;
13130     /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
13131      */
13132     node = regclass(pRExC_state, flagp,depth+1,
13133                     FALSE, /* means parse the whole char class */
13134                     FALSE, /* don't allow multi-char folds */
13135                     TRUE, /* silence non-portable warnings.  The above may very
13136                              well have generated non-portable code points, but
13137                              they're valid on this machine */
13138                     NULL);
13139     if (!node)
13140         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
13141                     PTR2UV(flagp));
13142     if (save_fold) {
13143         RExC_flags |= RXf_PMf_FOLD;
13144     }
13145     RExC_parse = save_parse + 1;
13146     RExC_end = save_end;
13147     SvREFCNT_dec_NN(final);
13148     SvREFCNT_dec_NN(result_string);
13149
13150     nextchar(pRExC_state);
13151     Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
13152     return node;
13153 }
13154 #undef IS_OPERAND
13155
13156 /* The names of properties whose definitions are not known at compile time are
13157  * stored in the SV 'listsv', after a constant heading.  So if its length has
13158  * changed since initialization, then there is a run-time definition.
       *
       * The expansion refers to the variables 'listsv' and 'initial_listsv_len',
       * so this is usable only where both are in scope, as they are in
       * regclass(). */
13159 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
13160                                         (SvCUR(listsv) != initial_listsv_len)
13161
13162 STATIC regnode *
13163 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
13164                  const bool stop_at_1,  /* Just parse the next thing, don't
13165                                            look for a full character class */
13166                  bool allow_multi_folds,
13167                  const bool silence_non_portable,   /* Don't output warnings
13168                                                        about too large
13169                                                        characters */
13170                  SV** ret_invlist)  /* Return an inversion list, not a node */
13171 {
13172     /* parse a bracketed class specification.  Most of these will produce an
13173      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
13174      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
13175      * under /i with multi-character folds: it will be rewritten following the
13176      * paradigm of this example, where the <multi-fold>s are characters which
13177      * fold to multiple character sequences:
13178      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
13179      * gets effectively rewritten as:
13180      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
13181      * reg() gets called (recursively) on the rewritten version, and this
13182      * function will return what it constructs.  (Actually the <multi-fold>s
13183      * aren't physically removed from the [abcdefghi], it's just that they are
13184      * ignored in the recursion by means of a flag:
13185      * <RExC_in_multi_char_class>.)
13186      *
13187      * ANYOF nodes contain a bit map for the first 256 characters, with the
13188      * corresponding bit set if that character is in the list.  For characters
13189      * above 255, a range list or swash is used.  There are extra bits for \w,
13190      * etc. in locale ANYOFs, as what these match is not determinable at
13191      * compile time
13192      *
13193      * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
13194      * to be restarted.  This can only happen if ret_invlist is non-NULL.
13195      */
13196
13197     dVAR;
13198     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
13199     IV range = 0;
13200     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
13201     regnode *ret;
13202     STRLEN numlen;
13203     IV namedclass = OOB_NAMEDCLASS;
13204     char *rangebegin = NULL;
13205     bool need_class = 0;
13206     SV *listsv = NULL;
13207     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
13208                                       than just initialized.  */
13209     SV* properties = NULL;    /* Code points that match \p{} \P{} */
13210     SV* posixes = NULL;     /* Code points that match classes like [:word:],
13211                                extended beyond the Latin1 range.  These have to
13212                                be kept separate from other code points for much
13213                                of this function because their handling  is
13214                                different under /i, and for most classes under
13215                                /d as well */
13216     SV* nposixes = NULL;    /* Similarly for [:^word:].  These are kept
13217                                separate for a while from the non-complemented
13218                                versions because of complications with /d
13219                                matching */
13220     UV element_count = 0;   /* Number of distinct elements in the class.
13221                                Optimizations may be possible if this is tiny */
13222     AV * multi_char_matches = NULL; /* Code points that fold to more than one
13223                                        character; used under /i */
13224     UV n;
13225     char * stop_ptr = RExC_end;    /* where to stop parsing */
13226     const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
13227                                                    space? */
13228     const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
13229
13230     /* Unicode properties are stored in a swash; this holds the current one
13231      * being parsed.  If this swash is the only above-latin1 component of the
13232      * character class, an optimization is to pass it directly on to the
13233      * execution engine.  Otherwise, it is set to NULL to indicate that there
13234      * are other things in the class that have to be dealt with at execution
13235      * time */
13236     SV* swash = NULL;           /* Code points that match \p{} \P{} */
13237
13238     /* Set if a component of this character class is user-defined; just passed
13239      * on to the engine */
13240     bool has_user_defined_property = FALSE;
13241
13242     /* inversion list of code points this node matches only when the target
13243      * string is in UTF-8.  (Because is under /d) */
13244     SV* depends_list = NULL;
13245
13246     /* Inversion list of code points this node matches regardless of things
13247      * like locale, folding, utf8ness of the target string */
13248     SV* cp_list = NULL;
13249
13250     /* Like cp_list, but code points on this list need to be checked for things
13251      * that fold to/from them under /i */
13252     SV* cp_foldable_list = NULL;
13253
13254     /* Like cp_list, but code points on this list are valid only when the
13255      * runtime locale is UTF-8 */
13256     SV* only_utf8_locale_list = NULL;
13257
13258 #ifdef EBCDIC
13259     /* In a range, counts how many 0-2 of the ends of it came from literals,
13260      * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
13261     UV literal_endpoint = 0;
13262 #endif
13263     bool invert = FALSE;    /* Is this class to be complemented */
13264
13265     bool warn_super = ALWAYS_WARN_SUPER;
13266
13267     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
13268         case we need to change the emitted regop to an EXACT. */
13269     const char * orig_parse = RExC_parse;
13270     const SSize_t orig_size = RExC_size;
13271     bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
13272     GET_RE_DEBUG_FLAGS_DECL;
13273
13274     PERL_ARGS_ASSERT_REGCLASS;
13275 #ifndef DEBUGGING
13276     PERL_UNUSED_ARG(depth);
13277 #endif
13278
13279     DEBUG_PARSE("clas");
13280
13281     /* Assume we are going to generate an ANYOF node. */
13282     ret = reganode(pRExC_state, ANYOF, 0);
13283
13284     if (SIZE_ONLY) {
13285         RExC_size += ANYOF_SKIP;
13286         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
13287     }
13288     else {
13289         ANYOF_FLAGS(ret) = 0;
13290
13291         RExC_emit += ANYOF_SKIP;
13292         listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
13293         initial_listsv_len = SvCUR(listsv);
13294         SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
13295     }
13296
13297     if (skip_white) {
13298         RExC_parse = regpatws(pRExC_state, RExC_parse,
13299                               FALSE /* means don't recognize comments */);
13300     }
13301
13302     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
13303         RExC_parse++;
13304         invert = TRUE;
13305         allow_multi_folds = FALSE;
13306         RExC_naughty++;
13307         if (skip_white) {
13308             RExC_parse = regpatws(pRExC_state, RExC_parse,
13309                                   FALSE /* means don't recognize comments */);
13310         }
13311     }
13312
13313     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
13314     if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
13315         const char *s = RExC_parse;
13316         const char  c = *s++;
13317
13318         while (isWORDCHAR(*s))
13319             s++;
13320         if (*s && c == *s && s[1] == ']') {
13321             SAVEFREESV(RExC_rx_sv);
13322             ckWARN3reg(s+2,
13323                        "POSIX syntax [%c %c] belongs inside character classes",
13324                        c, c);
13325             (void)ReREFCNT_inc(RExC_rx_sv);
13326         }
13327     }
13328
13329     /* If the caller wants us to just parse a single element, accomplish this
13330      * by faking the loop ending condition */
13331     if (stop_at_1 && RExC_end > RExC_parse) {
13332         stop_ptr = RExC_parse + 1;
13333     }
13334
13335     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
13336     if (UCHARAT(RExC_parse) == ']')
13337         goto charclassloop;
13338
13339 parseit:
13340     while (1) {
13341         if  (RExC_parse >= stop_ptr) {
13342             break;
13343         }
13344
13345         if (skip_white) {
13346             RExC_parse = regpatws(pRExC_state, RExC_parse,
13347                                   FALSE /* means don't recognize comments */);
13348         }
13349
13350         if  (UCHARAT(RExC_parse) == ']') {
13351             break;
13352         }
13353
13354     charclassloop:
13355
13356         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
13357         save_value = value;
13358         save_prevvalue = prevvalue;
13359
13360         if (!range) {
13361             rangebegin = RExC_parse;
13362             element_count++;
13363         }
13364         if (UTF) {
13365             value = utf8n_to_uvchr((U8*)RExC_parse,
13366                                    RExC_end - RExC_parse,
13367                                    &numlen, UTF8_ALLOW_DEFAULT);
13368             RExC_parse += numlen;
13369         }
13370         else
13371             value = UCHARAT(RExC_parse++);
13372
13373         if (value == '['
13374             && RExC_parse < RExC_end
13375             && POSIXCC(UCHARAT(RExC_parse)))
13376         {
13377             namedclass = regpposixcc(pRExC_state, value, strict);
13378         }
13379         else if (value == '\\') {
13380             if (UTF) {
13381                 value = utf8n_to_uvchr((U8*)RExC_parse,
13382                                    RExC_end - RExC_parse,
13383                                    &numlen, UTF8_ALLOW_DEFAULT);
13384                 RExC_parse += numlen;
13385             }
13386             else
13387                 value = UCHARAT(RExC_parse++);
13388
13389             /* Some compilers cannot handle switching on 64-bit integer
13390              * values, therefore value cannot be an UV.  Yes, this will
13391              * be a problem later if we want to switch on Unicode.
13392              * A similar issue arises a little bit later when switching on
13393              * namedclass. --jhi */
13394
13395             /* If the \ is escaping white space when white space is being
13396              * skipped, it means that that white space is wanted literally, and
13397              * is already in 'value'.  Otherwise, need to translate the escape
13398              * into what it signifies. */
13399             if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
13400
13401             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
13402             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
13403             case 's':   namedclass = ANYOF_SPACE;       break;
13404             case 'S':   namedclass = ANYOF_NSPACE;      break;
13405             case 'd':   namedclass = ANYOF_DIGIT;       break;
13406             case 'D':   namedclass = ANYOF_NDIGIT;      break;
13407             case 'v':   namedclass = ANYOF_VERTWS;      break;
13408             case 'V':   namedclass = ANYOF_NVERTWS;     break;
13409             case 'h':   namedclass = ANYOF_HORIZWS;     break;
13410             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
13411             case 'N':  /* Handle \N{NAME} in class */
13412                 {
13413                     /* We only pay attention to the first char of
13414                     multichar strings being returned. I kinda wonder
13415                     if this makes sense as it does change the behaviour
13416                     from earlier versions, OTOH that behaviour was broken
13417                     as well. */
13418                     if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
13419                                       TRUE, /* => charclass */
13420                                       strict))
13421                     {
13422                         if (*flagp & RESTART_UTF8)
13423                             FAIL("panic: grok_bslash_N set RESTART_UTF8");
13424                         goto parseit;
13425                     }
13426                 }
13427                 break;
13428             case 'p':
13429             case 'P':
13430                 {
13431                 char *e;
13432
13433                 /* We will handle any undefined properties ourselves */
13434                 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
13435                                        /* And we actually would prefer to get
13436                                         * the straight inversion list of the
13437                                         * swash, since we will be accessing it
13438                                         * anyway, to save a little time */
13439                                       |_CORE_SWASH_INIT_ACCEPT_INVLIST;
13440
13441                 if (RExC_parse >= RExC_end)
13442                     vFAIL2("Empty \\%c{}", (U8)value);
13443                 if (*RExC_parse == '{') {
13444                     const U8 c = (U8)value;
13445                     e = strchr(RExC_parse++, '}');
13446                     if (!e)
13447                         vFAIL2("Missing right brace on \\%c{}", c);
13448                     while (isSPACE(*RExC_parse))
13449                         RExC_parse++;
13450                     if (e == RExC_parse)
13451                         vFAIL2("Empty \\%c{}", c);
13452                     n = e - RExC_parse;
13453                     while (isSPACE(*(RExC_parse + n - 1)))
13454                         n--;
13455                 }
13456                 else {
13457                     e = RExC_parse;
13458                     n = 1;
13459                 }
13460                 if (!SIZE_ONLY) {
13461                     SV* invlist;
13462                     char* formatted;
13463                     char* name;
13464
13465                     if (UCHARAT(RExC_parse) == '^') {
13466                          RExC_parse++;
13467                          n--;
13468                          /* toggle.  (The rhs xor gets the single bit that
13469                           * differs between P and p; the other xor inverts just
13470                           * that bit) */
13471                          value ^= 'P' ^ 'p';
13472
13473                          while (isSPACE(*RExC_parse)) {
13474                               RExC_parse++;
13475                               n--;
13476                          }
13477                     }
13478                     /* Try to get the definition of the property into
13479                      * <invlist>.  If /i is in effect, the effective property
13480                      * will have its name be <__NAME_i>.  The design is
13481                      * discussed in commit
13482                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
13483                     formatted = Perl_form(aTHX_
13484                                           "%s%.*s%s\n",
13485                                           (FOLD) ? "__" : "",
13486                                           (int)n,
13487                                           RExC_parse,
13488                                           (FOLD) ? "_i" : ""
13489                                 );
13490                     name = savepvn(formatted, strlen(formatted));
13491
13492                     /* Look up the property name, and get its swash and
13493                      * inversion list, if the property is found  */
13494                     if (swash) {
13495                         SvREFCNT_dec_NN(swash);
13496                     }
13497                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
13498                                              1, /* binary */
13499                                              0, /* not tr/// */
13500                                              NULL, /* No inversion list */
13501                                              &swash_init_flags
13502                                             );
13503                     if (! swash || ! (invlist = _get_swash_invlist(swash))) {
13504                         if (swash) {
13505                             SvREFCNT_dec_NN(swash);
13506                             swash = NULL;
13507                         }
13508
13509                         /* Here didn't find it.  It could be a user-defined
13510                          * property that will be available at run-time.  If we
13511                          * accept only compile-time properties, it is an error;
13512                          * otherwise add it to the list for run-time look up */
13513                         if (ret_invlist) {
13514                             RExC_parse = e + 1;
13515                             vFAIL2utf8f(
13516                                 "Property '%"UTF8f"' is unknown",
13517                                 UTF8fARG(UTF, n, name));
13518                         }
13519                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
13520                                         (value == 'p' ? '+' : '!'),
13521                                         UTF8fARG(UTF, n, name));
13522                         has_user_defined_property = TRUE;
13523
13524                         /* We don't know yet, so have to assume that the
13525                          * property could match something in the Latin1 range,
13526                          * hence something that isn't utf8.  Note that this
13527                          * would cause things in <depends_list> to match
13528                          * inappropriately, except that any \p{}, including
13529                          * this one forces Unicode semantics, which means there
13530                          * is no <depends_list> */
13531                         ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
13532                     }
13533                     else {
13534
13535                         /* Here, did get the swash and its inversion list.  If
13536                          * the swash is from a user-defined property, then this
13537                          * whole character class should be regarded as such */
13538                         if (swash_init_flags
13539                             & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
13540                         {
13541                             has_user_defined_property = TRUE;
13542                         }
13543                         else if
13544                             /* We warn on matching an above-Unicode code point
13545                              * if the match would return true, except don't
13546                              * warn for \p{All}, which has exactly one element
13547                              * = 0 */
13548                             (_invlist_contains_cp(invlist, 0x110000)
13549                                 && (! (_invlist_len(invlist) == 1
13550                                        && *invlist_array(invlist) == 0)))
13551                         {
13552                             warn_super = TRUE;
13553                         }
13554
13555
13556                         /* Invert if asking for the complement */
13557                         if (value == 'P') {
13558                             _invlist_union_complement_2nd(properties,
13559                                                           invlist,
13560                                                           &properties);
13561
13562                             /* The swash can't be used as-is, because we've
13563                              * inverted things; delay removing it to here after
13564                              * having copied its invlist above */
13565                             SvREFCNT_dec_NN(swash);
13566                             swash = NULL;
13567                         }
13568                         else {
13569                             _invlist_union(properties, invlist, &properties);
13570                         }
13571                     }
13572                     Safefree(name);
13573                 }
13574                 RExC_parse = e + 1;
13575                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
13576                                                 named */
13577
13578                 /* \p means they want Unicode semantics */
13579                 RExC_uni_semantics = 1;
13580                 }
13581                 break;
13582             case 'n':   value = '\n';                   break;
13583             case 'r':   value = '\r';                   break;
13584             case 't':   value = '\t';                   break;
13585             case 'f':   value = '\f';                   break;
13586             case 'b':   value = '\b';                   break;
13587             case 'e':   value = ASCII_TO_NATIVE('\033');break;
13588             case 'a':   value = '\a';                   break;
13589             case 'o':
13590                 RExC_parse--;   /* function expects to be pointed at the 'o' */
13591                 {
13592                     const char* error_msg;
13593                     bool valid = grok_bslash_o(&RExC_parse,
13594                                                &value,
13595                                                &error_msg,
13596                                                SIZE_ONLY,   /* warnings in pass
13597                                                                1 only */
13598                                                strict,
13599                                                silence_non_portable,
13600                                                UTF);
13601                     if (! valid) {
13602                         vFAIL(error_msg);
13603                     }
13604                 }
13605                 if (PL_encoding && value < 0x100) {
13606                     goto recode_encoding;
13607                 }
13608                 break;
13609             case 'x':
13610                 RExC_parse--;   /* function expects to be pointed at the 'x' */
13611                 {
13612                     const char* error_msg;
13613                     bool valid = grok_bslash_x(&RExC_parse,
13614                                                &value,
13615                                                &error_msg,
13616                                                TRUE, /* Output warnings */
13617                                                strict,
13618                                                silence_non_portable,
13619                                                UTF);
13620                     if (! valid) {
13621                         vFAIL(error_msg);
13622                     }
13623                 }
13624                 if (PL_encoding && value < 0x100)
13625                     goto recode_encoding;
13626                 break;
13627             case 'c':
13628                 value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
13629                 break;
13630             case '0': case '1': case '2': case '3': case '4':
13631             case '5': case '6': case '7':
13632                 {
13633                     /* Take 1-3 octal digits */
13634                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
13635                     numlen = (strict) ? 4 : 3;
13636                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
13637                     RExC_parse += numlen;
13638                     if (numlen != 3) {
13639                         if (strict) {
13640                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13641                             vFAIL("Need exactly 3 octal digits");
13642                         }
13643                         else if (! SIZE_ONLY /* like \08, \178 */
13644                                  && numlen < 3
13645                                  && RExC_parse < RExC_end
13646                                  && isDIGIT(*RExC_parse)
13647                                  && ckWARN(WARN_REGEXP))
13648                         {
13649                             SAVEFREESV(RExC_rx_sv);
13650                             reg_warn_non_literal_string(
13651                                  RExC_parse + 1,
13652                                  form_short_octal_warning(RExC_parse, numlen));
13653                             (void)ReREFCNT_inc(RExC_rx_sv);
13654                         }
13655                     }
13656                     if (PL_encoding && value < 0x100)
13657                         goto recode_encoding;
13658                     break;
13659                 }
13660             recode_encoding:
13661                 if (! RExC_override_recoding) {
13662                     SV* enc = PL_encoding;
13663                     value = reg_recode((const char)(U8)value, &enc);
13664                     if (!enc) {
13665                         if (strict) {
13666                             vFAIL("Invalid escape in the specified encoding");
13667                         }
13668                         else if (SIZE_ONLY) {
13669                             ckWARNreg(RExC_parse,
13670                                   "Invalid escape in the specified encoding");
13671                         }
13672                     }
13673                     break;
13674                 }
13675             default:
13676                 /* Allow \_ to not give an error */
13677                 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
13678                     if (strict) {
13679                         vFAIL2("Unrecognized escape \\%c in character class",
13680                                (int)value);
13681                     }
13682                     else {
13683                         SAVEFREESV(RExC_rx_sv);
13684                         ckWARN2reg(RExC_parse,
13685                             "Unrecognized escape \\%c in character class passed through",
13686                             (int)value);
13687                         (void)ReREFCNT_inc(RExC_rx_sv);
13688                     }
13689                 }
13690                 break;
13691             }   /* End of switch on char following backslash */
13692         } /* end of handling backslash escape sequences */
13693 #ifdef EBCDIC
13694         else
13695             literal_endpoint++;
13696 #endif
13697
13698         /* Here, we have the current token in 'value' */
13699
13700         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
13701             U8 classnum;
13702
13703             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
13704              * literal, as is the character that began the false range, i.e.
13705              * the 'a' in the examples */
13706             if (range) {
13707                 if (!SIZE_ONLY) {
13708                     const int w = (RExC_parse >= rangebegin)
13709                                   ? RExC_parse - rangebegin
13710                                   : 0;
13711                     if (strict) {
13712                         vFAIL2utf8f(
13713                             "False [] range \"%"UTF8f"\"",
13714                             UTF8fARG(UTF, w, rangebegin));
13715                     }
13716                     else {
13717                         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
13718                         ckWARN2reg(RExC_parse,
13719                             "False [] range \"%"UTF8f"\"",
13720                             UTF8fARG(UTF, w, rangebegin));
13721                         (void)ReREFCNT_inc(RExC_rx_sv);
13722                         cp_list = add_cp_to_invlist(cp_list, '-');
13723                         cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
13724                                                              prevvalue);
13725                     }
13726                 }
13727
13728                 range = 0; /* this was not a true range */
13729                 element_count += 2; /* So counts for three values */
13730             }
13731
13732             classnum = namedclass_to_classnum(namedclass);
13733
13734             if (LOC && namedclass < ANYOF_POSIXL_MAX
13735 #ifndef HAS_ISASCII
13736                 && classnum != _CC_ASCII
13737 #endif
13738             ) {
13739                 /* What the Posix classes (like \w, [:space:]) match in locale
13740                  * isn't knowable under locale until actual match time.  Room
13741                  * must be reserved (one time per outer bracketed class) to
13742                  * store such classes.  The space will contain a bit for each
13743                  * named class that is to be matched against.  This isn't
13744                  * needed for \p{} and pseudo-classes, as they are not affected
13745                  * by locale, and hence are dealt with separately */
13746                 if (! need_class) {
13747                     need_class = 1;
13748                     if (SIZE_ONLY) {
13749                         RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13750                     }
13751                     else {
13752                         RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13753                     }
13754                     ANYOF_FLAGS(ret) |= ANYOF_POSIXL;
13755                     ANYOF_POSIXL_ZERO(ret);
13756                 }
13757
13758                 /* See if it already matches the complement of this POSIX
13759                  * class */
13760                 if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL)
13761                     && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
13762                                                             ? -1
13763                                                             : 1)))
13764                 {
13765                     posixl_matches_all = TRUE;
13766                     break;  /* No need to continue.  Since it matches both
13767                                e.g., \w and \W, it matches everything, and the
13768                                bracketed class can be optimized into qr/./s */
13769                 }
13770
13771                 /* Add this class to those that should be checked at runtime */
13772                 ANYOF_POSIXL_SET(ret, namedclass);
13773
13774                 /* The above-Latin1 characters are not subject to locale rules.
13775                  * Just add them, in the second pass, to the
13776                  * unconditionally-matched list */
13777                 if (! SIZE_ONLY) {
13778                     SV* scratch_list = NULL;
13779
13780                     /* Get the list of the above-Latin1 code points this
13781                      * matches */
13782                     _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
13783                                           PL_XPosix_ptrs[classnum],
13784
13785                                           /* Odd numbers are complements, like
13786                                            * NDIGIT, NASCII, ... */
13787                                           namedclass % 2 != 0,
13788                                           &scratch_list);
13789                     /* Checking if 'cp_list' is NULL first saves an extra
13790                      * clone.  Its reference count will be decremented at the
13791                      * next union, etc, or if this is the only instance, at the
13792                      * end of the routine */
13793                     if (! cp_list) {
13794                         cp_list = scratch_list;
13795                     }
13796                     else {
13797                         _invlist_union(cp_list, scratch_list, &cp_list);
13798                         SvREFCNT_dec_NN(scratch_list);
13799                     }
13800                     continue;   /* Go get next character */
13801                 }
13802             }
13803             else if (! SIZE_ONLY) {
13804
13805                 /* Here, not in pass1 (in that pass we skip calculating the
13806                  * contents of this class), and is not /l, or is a POSIX class
13807                  * for which /l doesn't matter (or is a Unicode property, which
13808                  * is skipped here). */
13809                 if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
13810                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
13811
13812                         /* Here, should be \h, \H, \v, or \V.  None of /d, /i
13813                          * nor /l make a difference in what these match,
13814                          * therefore we just add what they match to cp_list. */
13815                         if (classnum != _CC_VERTSPACE) {
13816                             assert(   namedclass == ANYOF_HORIZWS
13817                                    || namedclass == ANYOF_NHORIZWS);
13818
13819                             /* It turns out that \h is just a synonym for
13820                              * XPosixBlank */
13821                             classnum = _CC_BLANK;
13822                         }
13823
13824                         _invlist_union_maybe_complement_2nd(
13825                                 cp_list,
13826                                 PL_XPosix_ptrs[classnum],
13827                                 namedclass % 2 != 0,    /* Complement if odd
13828                                                           (NHORIZWS, NVERTWS)
13829                                                         */
13830                                 &cp_list);
13831                     }
13832                 }
13833                 else {  /* Garden variety class.  If is NASCII, NDIGIT, ...
13834                            complement and use nposixes */
13835                     SV** posixes_ptr = namedclass % 2 == 0
13836                                        ? &posixes
13837                                        : &nposixes;
13838                     SV** source_ptr = &PL_XPosix_ptrs[classnum];
13839                     _invlist_union_maybe_complement_2nd(
13840                                                      *posixes_ptr,
13841                                                      *source_ptr,
13842                                                      namedclass % 2 != 0,
13843                                                      posixes_ptr);
13844                 }
13845                 continue;   /* Go get next character */
13846             }
13847         } /* end of namedclass \blah */
13848
13849         /* Here, we have a single value.  If 'range' is set, it is the ending
13850          * of a range--check its validity.  Later, we will handle each
13851          * individual code point in the range.  If 'range' isn't set, this
13852          * could be the beginning of a range, so check for that by looking
13853          * ahead to see if the next real character to be processed is the range
13854          * indicator--the minus sign */
13855
13856         if (skip_white) {
13857             RExC_parse = regpatws(pRExC_state, RExC_parse,
13858                                 FALSE /* means don't recognize comments */);
13859         }
13860
13861         if (range) {
13862             if (prevvalue > value) /* b-a */ {
13863                 const int w = RExC_parse - rangebegin;
13864                 vFAIL2utf8f(
13865                     "Invalid [] range \"%"UTF8f"\"",
13866                     UTF8fARG(UTF, w, rangebegin));
13867                 range = 0; /* not a valid range */
13868             }
13869         }
13870         else {
13871             prevvalue = value; /* save the beginning of the potential range */
13872             if (! stop_at_1     /* Can't be a range if parsing just one thing */
13873                 && *RExC_parse == '-')
13874             {
13875                 char* next_char_ptr = RExC_parse + 1;
13876                 if (skip_white) {   /* Get the next real char after the '-' */
13877                     next_char_ptr = regpatws(pRExC_state,
13878                                              RExC_parse + 1,
13879                                              FALSE); /* means don't recognize
13880                                                         comments */
13881                 }
13882
13883                 /* If the '-' is at the end of the class (just before the ']'),
13884                  * it is a literal minus; otherwise it is a range */
13885                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
13886                     RExC_parse = next_char_ptr;
13887
13888                     /* a bad range like \w-, [:word:]- ? */
13889                     if (namedclass > OOB_NAMEDCLASS) {
13890                         if (strict || ckWARN(WARN_REGEXP)) {
13891                             const int w =
13892                                 RExC_parse >= rangebegin ?
13893                                 RExC_parse - rangebegin : 0;
13894                             if (strict) {
13895                                 vFAIL4("False [] range \"%*.*s\"",
13896                                     w, w, rangebegin);
13897                             }
13898                             else {
13899                                 vWARN4(RExC_parse,
13900                                     "False [] range \"%*.*s\"",
13901                                     w, w, rangebegin);
13902                             }
13903                         }
13904                         if (!SIZE_ONLY) {
13905                             cp_list = add_cp_to_invlist(cp_list, '-');
13906                         }
13907                         element_count++;
13908                     } else
13909                         range = 1;      /* yeah, it's a range! */
13910                     continue;   /* but do it the next time */
13911                 }
13912             }
13913         }
13914
13915         /* Here, <prevvalue> is the beginning of the range, if any; or <value>
13916          * if not */
13917
13918         /* non-Latin1 code point implies unicode semantics.  Must be set in
13919          * pass1 so is there for the whole of pass 2 */
13920         if (value > 255) {
13921             RExC_uni_semantics = 1;
13922         }
13923
13924         /* Ready to process either the single value, or the completed range.
13925          * For single-valued non-inverted ranges, we consider the possibility
13926          * of multi-char folds.  (We made a conscious decision to not do this
13927          * for the other cases because it can often lead to non-intuitive
13928          * results.  For example, you have the peculiar case that:
13929          *  "s s" =~ /^[^\xDF]+$/i => Y
13930          *  "ss"  =~ /^[^\xDF]+$/i => N
13931          *
13932          * See [perl #89750]) */
13933         if (FOLD && allow_multi_folds && value == prevvalue) {
13934             if (value == LATIN_SMALL_LETTER_SHARP_S
13935                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
13936                                                         value)))
13937             {
13938                 /* Here <value> is indeed a multi-char fold.  Get what it is */
13939
13940                 U8 foldbuf[UTF8_MAXBYTES_CASE];
13941                 STRLEN foldlen;
13942
13943                 UV folded = _to_uni_fold_flags(
13944                                 value,
13945                                 foldbuf,
13946                                 &foldlen,
13947                                 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
13948                                                    ? FOLD_FLAGS_NOMIX_ASCII
13949                                                    : 0)
13950                                 );
13951
13952                 /* Here, <folded> should be the first character of the
13953                  * multi-char fold of <value>, with <foldbuf> containing the
13954                  * whole thing.  But, if this fold is not allowed (because of
13955                  * the flags), <folded> will be the same as <value>, and should
13956                  * be processed like any other character, so skip the special
13957                  * handling */
13958                 if (folded != value) {
13959
13960                     /* Skip if we are recursed, currently parsing the class
13961                      * again.  Otherwise add this character to the list of
13962                      * multi-char folds. */
13963                     if (! RExC_in_multi_char_class) {
13964                         AV** this_array_ptr;
13965                         AV* this_array;
13966                         STRLEN cp_count = utf8_length(foldbuf,
13967                                                       foldbuf + foldlen);
13968                         SV* multi_fold = sv_2mortal(newSVpvn("", 0));
13969
13970                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
13971
13972
13973                         if (! multi_char_matches) {
13974                             multi_char_matches = newAV();
13975                         }
13976
13977                         /* <multi_char_matches> is actually an array of arrays.
13978                          * There will be one or two top-level elements: [2],
13979                          * and/or [3].  The [2] element is an array, each
13980                          * element thereof is a character which folds to TWO
13981                          * characters; [3] is for folds to THREE characters.
13982                          * (Unicode guarantees a maximum of 3 characters in any
13983                          * fold.)  When we rewrite the character class below,
13984                          * we will do so such that the longest folds are
13985                          * written first, so that it prefers the longest
13986                          * matching strings first.  This is done even if it
13987                          * turns out that any quantifier is non-greedy, out of
13988                          * programmer laziness.  Tom Christiansen has agreed
13989                          * that this is ok.  This makes the test for the
13990                          * ligature 'ffi' come before the test for 'ff' */
13991                         if (av_exists(multi_char_matches, cp_count)) {
13992                             this_array_ptr = (AV**) av_fetch(multi_char_matches,
13993                                                              cp_count, FALSE);
13994                             this_array = *this_array_ptr;
13995                         }
13996                         else {
13997                             this_array = newAV();
13998                             av_store(multi_char_matches, cp_count,
13999                                      (SV*) this_array);
14000                         }
14001                         av_push(this_array, multi_fold);
14002                     }
14003
14004                     /* This element should not be processed further in this
14005                      * class */
14006                     element_count--;
14007                     value = save_value;
14008                     prevvalue = save_prevvalue;
14009                     continue;
14010                 }
14011             }
14012         }
14013
14014         /* Deal with this element of the class */
14015         if (! SIZE_ONLY) {
14016 #ifndef EBCDIC
14017             cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14018                                                      prevvalue, value);
14019 #else
14020             SV* this_range = _new_invlist(1);
14021             _append_range_to_invlist(this_range, prevvalue, value);
14022
14023             /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
14024              * If this range was specified using something like 'i-j', we want
14025              * to include only the 'i' and the 'j', and not anything in
14026              * between, so exclude non-ASCII, non-alphabetics from it.
14027              * However, if the range was specified with something like
14028              * [\x89-\x91] or [\x89-j], all code points within it should be
14029              * included.  literal_endpoint==2 means both ends of the range used
14030              * a literal character, not \x{foo} */
14031             if (literal_endpoint == 2
14032                 && ((prevvalue >= 'a' && value <= 'z')
14033                     || (prevvalue >= 'A' && value <= 'Z')))
14034             {
14035                 _invlist_intersection(this_range, PL_ASCII,
14036                                       &this_range);
14037
14038                 /* Since the above contains only ASCII, the intersection of it
14039                  * with anything will still yield only ASCII */
14040                 _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA],
14041                                       &this_range);
14042             }
14043             _invlist_union(cp_foldable_list, this_range, &cp_foldable_list);
14044             literal_endpoint = 0;
14045 #endif
14046         }
14047
14048         range = 0; /* this range (if it was one) is done now */
14049     } /* End of loop through all the text within the brackets */
14050
14051     /* If anything in the class expands to more than one character, we have to
14052      * deal with them by building up a substitute parse string, and recursively
14053      * calling reg() on it, instead of proceeding */
14054     if (multi_char_matches) {
14055         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
14056         I32 cp_count;
14057         STRLEN len;
14058         char *save_end = RExC_end;
14059         char *save_parse = RExC_parse;
14060         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
14061                                        a "|" */
14062         I32 reg_flags;
14063
14064         assert(! invert);
14065 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
14066            because too confusing */
14067         if (invert) {
14068             sv_catpv(substitute_parse, "(?:");
14069         }
14070 #endif
14071
14072         /* Look at the longest folds first */
14073         for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
14074
14075             if (av_exists(multi_char_matches, cp_count)) {
14076                 AV** this_array_ptr;
14077                 SV* this_sequence;
14078
14079                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
14080                                                  cp_count, FALSE);
14081                 while ((this_sequence = av_pop(*this_array_ptr)) !=
14082                                                                 &PL_sv_undef)
14083                 {
14084                     if (! first_time) {
14085                         sv_catpv(substitute_parse, "|");
14086                     }
14087                     first_time = FALSE;
14088
14089                     sv_catpv(substitute_parse, SvPVX(this_sequence));
14090                 }
14091             }
14092         }
14093
14094         /* If the character class contains anything else besides these
14095          * multi-character folds, have to include it in recursive parsing */
14096         if (element_count) {
14097             sv_catpv(substitute_parse, "|[");
14098             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
14099             sv_catpv(substitute_parse, "]");
14100         }
14101
14102         sv_catpv(substitute_parse, ")");
14103 #if 0
14104         if (invert) {
14105             /* This is a way to get the parse to skip forward a whole named
14106              * sequence instead of matching the 2nd character when it fails the
14107              * first */
14108             sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
14109         }
14110 #endif
14111
14112         RExC_parse = SvPV(substitute_parse, len);
14113         RExC_end = RExC_parse + len;
14114         RExC_in_multi_char_class = 1;
14115         RExC_emit = (regnode *)orig_emit;
14116
14117         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
14118
14119         *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
14120
14121         RExC_parse = save_parse;
14122         RExC_end = save_end;
14123         RExC_in_multi_char_class = 0;
14124         SvREFCNT_dec_NN(multi_char_matches);
14125         return ret;
14126     }
14127
14128     /* Here, we've gone through the entire class and dealt with multi-char
14129      * folds.  We are now in a position that we can do some checks to see if we
14130      * can optimize this ANYOF node into a simpler one, even in Pass 1.
14131      * Currently we only do two checks:
14132      * 1) in the unlikely event that the user has specified both, e.g. \w and
14133      *    \W under /l, then the class matches everything.  (This optimization
14134      *    is done only to make the optimizer code run later work.)
14135      * 2) if the character class contains only a single element (including a
14136      *    single range), we see if there is an equivalent node for it.
14137      * Other checks are possible */
14138     if (! ret_invlist   /* Can't optimize if returning the constructed
14139                            inversion list */
14140         && (UNLIKELY(posixl_matches_all) || element_count == 1))
14141     {
14142         U8 op = END;
14143         U8 arg = 0;
14144
14145         if (UNLIKELY(posixl_matches_all)) {
14146             op = SANY;
14147         }
14148         else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
14149                                                    \w or [:digit:] or \p{foo}
14150                                                  */
14151
14152             /* All named classes are mapped into POSIXish nodes, with its FLAG
14153              * argument giving which class it is */
14154             switch ((I32)namedclass) {
14155                 case ANYOF_UNIPROP:
14156                     break;
14157
14158                 /* These don't depend on the charset modifiers.  They always
14159                  * match under /u rules */
14160                 case ANYOF_NHORIZWS:
14161                 case ANYOF_HORIZWS:
14162                     namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
14163                     /* FALLTHROUGH */
14164
14165                 case ANYOF_NVERTWS:
14166                 case ANYOF_VERTWS:
14167                     op = POSIXU;
14168                     goto join_posix;
14169
14170                 /* The actual POSIXish node for all the rest depends on the
14171                  * charset modifier.  The ones in the first set depend only on
14172                  * ASCII or, if available on this platform, locale */
14173                 case ANYOF_ASCII:
14174                 case ANYOF_NASCII:
14175 #ifdef HAS_ISASCII
14176                     op = (LOC) ? POSIXL : POSIXA;
14177 #else
14178                     op = POSIXA;
14179 #endif
14180                     goto join_posix;
14181
14182                 case ANYOF_NCASED:
14183                 case ANYOF_LOWER:
14184                 case ANYOF_NLOWER:
14185                 case ANYOF_UPPER:
14186                 case ANYOF_NUPPER:
14187                     /* under /a could be alpha */
14188                     if (FOLD) {
14189                         if (ASCII_RESTRICTED) {
14190                             namedclass = ANYOF_ALPHA + (namedclass % 2);
14191                         }
14192                         else if (! LOC) {
14193                             break;
14194                         }
14195                     }
14196                     /* FALLTHROUGH */
14197
14198                 /* The rest have more possibilities depending on the charset.
14199                  * We take advantage of the enum ordering of the charset
14200                  * modifiers to get the exact node type. */
14201                 default:
14202                     op = POSIXD + get_regex_charset(RExC_flags);
14203                     if (op > POSIXA) { /* /aa is same as /a */
14204                         op = POSIXA;
14205                     }
14206
14207                 join_posix:
14208                     /* The odd numbered ones are the complements of the
14209                      * next-lower even number one */
14210                     if (namedclass % 2 == 1) {
14211                         invert = ! invert;
14212                         namedclass--;
14213                     }
14214                     arg = namedclass_to_classnum(namedclass);
14215                     break;
14216             }
14217         }
14218         else if (value == prevvalue) {
14219
14220             /* Here, the class consists of just a single code point */
14221
14222             if (invert) {
14223                 if (! LOC && value == '\n') {
14224                     op = REG_ANY; /* Optimize [^\n] */
14225                     *flagp |= HASWIDTH|SIMPLE;
14226                     RExC_naughty++;
14227                 }
14228             }
14229             else if (value < 256 || UTF) {
14230
14231                 /* Optimize a single value into an EXACTish node, but not if it
14232                  * would require converting the pattern to UTF-8. */
14233                 op = compute_EXACTish(pRExC_state);
14234             }
14235         } /* Otherwise is a range */
14236         else if (! LOC) {   /* locale could vary these */
14237             if (prevvalue == '0') {
14238                 if (value == '9') {
14239                     arg = _CC_DIGIT;
14240                     op = POSIXA;
14241                 }
14242             }
14243         }
14244
14245         /* Here, we have changed <op> away from its initial value iff we found
14246          * an optimization */
14247         if (op != END) {
14248
14249             /* Throw away this ANYOF regnode, and emit the calculated one,
14250              * which should correspond to the beginning, not current, state of
14251              * the parse */
14252             const char * cur_parse = RExC_parse;
14253             RExC_parse = (char *)orig_parse;
14254             if ( SIZE_ONLY) {
14255                 if (! LOC) {
14256
14257                     /* To get locale nodes to not use the full ANYOF size would
14258                      * require moving the code above that writes the portions
14259                      * of it that aren't in other nodes to after this point.
14260                      * e.g.  ANYOF_POSIXL_SET */
14261                     RExC_size = orig_size;
14262                 }
14263             }
14264             else {
14265                 RExC_emit = (regnode *)orig_emit;
14266                 if (PL_regkind[op] == POSIXD) {
14267                     if (op == POSIXL) {
14268                         RExC_contains_locale = 1;
14269                     }
14270                     if (invert) {
14271                         op += NPOSIXD - POSIXD;
14272                     }
14273                 }
14274             }
14275
14276             ret = reg_node(pRExC_state, op);
14277
14278             if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
14279                 if (! SIZE_ONLY) {
14280                     FLAGS(ret) = arg;
14281                 }
14282                 *flagp |= HASWIDTH|SIMPLE;
14283             }
14284             else if (PL_regkind[op] == EXACT) {
14285                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14286                                            TRUE /* downgradable to EXACT */
14287                                            );
14288             }
14289
14290             RExC_parse = (char *) cur_parse;
14291
14292             SvREFCNT_dec(posixes);
14293             SvREFCNT_dec(nposixes);
14294             SvREFCNT_dec(cp_list);
14295             SvREFCNT_dec(cp_foldable_list);
14296             return ret;
14297         }
14298     }
14299
14300     if (SIZE_ONLY)
14301         return ret;
14302     /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
14303
14304     /* If folding, we calculate all characters that could fold to or from the
14305      * ones already on the list */
14306     if (cp_foldable_list) {
14307         if (FOLD) {
14308             UV start, end;      /* End points of code point ranges */
14309
14310             SV* fold_intersection = NULL;
14311             SV** use_list;
14312
14313             /* Our calculated list will be for Unicode rules.  For locale
14314              * matching, we have to keep a separate list that is consulted at
14315              * runtime only when the locale indicates Unicode rules.  For
14316              * non-locale, we just use the general list */
14317             if (LOC) {
14318                 use_list = &only_utf8_locale_list;
14319             }
14320             else {
14321                 use_list = &cp_list;
14322             }
14323
14324             /* Only the characters in this class that participate in folds need
14325              * be checked.  Get the intersection of this class and all the
14326              * possible characters that are foldable.  This can quickly narrow
14327              * down a large class */
14328             _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
14329                                   &fold_intersection);
14330
14331             /* The folds for all the Latin1 characters are hard-coded into this
14332              * program, but we have to go out to disk to get the others. */
14333             if (invlist_highest(cp_foldable_list) >= 256) {
14334
14335                 /* This is a hash that for a particular fold gives all
14336                  * characters that are involved in it */
14337                 if (! PL_utf8_foldclosures) {
14338
14339                     /* If the folds haven't been read in, call a fold function
14340                      * to force that */
14341                     if (! PL_utf8_tofold) {
14342                         U8 dummy[UTF8_MAXBYTES_CASE+1];
14343
14344                         /* This string is just a short named one above \xff */
14345                         to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
14346                         assert(PL_utf8_tofold); /* Verify that worked */
14347                     }
14348                     PL_utf8_foldclosures
14349                                       = _swash_inversion_hash(PL_utf8_tofold);
14350                 }
14351             }
14352
14353             /* Now look at the foldable characters in this class individually */
14354             invlist_iterinit(fold_intersection);
14355             while (invlist_iternext(fold_intersection, &start, &end)) {
14356                 UV j;
14357
14358                 /* Look at every character in the range */
14359                 for (j = start; j <= end; j++) {
14360                     U8 foldbuf[UTF8_MAXBYTES_CASE+1];
14361                     STRLEN foldlen;
14362                     SV** listp;
14363
14364                     if (j < 256) {
14365
14366                         /* We have the latin1 folding rules hard-coded here so
14367                          * that an innocent-looking character class, like
14368                          * /[ks]/i won't have to go out to disk to find the
14369                          * possible matches.  XXX It would be better to
14370                          * generate these via regen, in case a new version of
14371                          * the Unicode standard adds new mappings, though that
14372                          * is not really likely, and may be caught by the
14373                          * default: case of the switch below. */
14374
14375                         if (IS_IN_SOME_FOLD_L1(j)) {
14376
14377                             /* ASCII is always matched; non-ASCII is matched
14378                              * only under Unicode rules (which could happen
14379                              * under /l if the locale is a UTF-8 one) */
14380                             if (isASCII(j) || ! DEPENDS_SEMANTICS) {
14381                                 *use_list = add_cp_to_invlist(*use_list,
14382                                                             PL_fold_latin1[j]);
14383                             }
14384                             else {
14385                                 depends_list =
14386                                  add_cp_to_invlist(depends_list,
14387                                                    PL_fold_latin1[j]);
14388                             }
14389                         }
14390
14391                         if (HAS_NONLATIN1_FOLD_CLOSURE(j)
14392                             && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
14393                         {
14394                             /* Certain Latin1 characters have matches outside
14395                             * Latin1.  To get here, <j> is one of those
14396                             * characters.   None of these matches is valid for
14397                             * ASCII characters under /aa, which is why the 'if'
14398                             * just above excludes those.  These matches only
14399                             * happen when the target string is utf8.  The code
14400                             * below adds the single fold closures for <j> to the
14401                             * inversion list. */
14402
14403                             switch (j) {
14404                                 case 'k':
14405                                 case 'K':
14406                                   *use_list =
14407                                      add_cp_to_invlist(*use_list, KELVIN_SIGN);
14408                                     break;
14409                                 case 's':
14410                                 case 'S':
14411                                   *use_list = add_cp_to_invlist(*use_list,
14412                                                     LATIN_SMALL_LETTER_LONG_S);
14413                                     break;
14414                                 case MICRO_SIGN:
14415                                   *use_list = add_cp_to_invlist(*use_list,
14416                                                       GREEK_CAPITAL_LETTER_MU);
14417                                   *use_list = add_cp_to_invlist(*use_list,
14418                                                         GREEK_SMALL_LETTER_MU);
14419                                     break;
14420                                 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
14421                                 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
14422                                   *use_list =
14423                                    add_cp_to_invlist(*use_list, ANGSTROM_SIGN);
14424                                     break;
14425                                 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
14426                                   *use_list = add_cp_to_invlist(*use_list,
14427                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
14428                                     break;
14429                                 case LATIN_SMALL_LETTER_SHARP_S:
14430                                   *use_list = add_cp_to_invlist(*use_list,
14431                                                  LATIN_CAPITAL_LETTER_SHARP_S);
14432                                     break;
14433                                 case 'F': case 'f':
14434                                 case 'I': case 'i':
14435                                 case 'L': case 'l':
14436                                 case 'T': case 't':
14437                                 case 'A': case 'a':
14438                                 case 'H': case 'h':
14439                                 case 'J': case 'j':
14440                                 case 'N': case 'n':
14441                                 case 'W': case 'w':
14442                                 case 'Y': case 'y':
14443                                     /* These all are targets of multi-character
14444                                      * folds from code points that require UTF8
14445                                      * to express, so they can't match unless
14446                                      * the target string is in UTF-8, so no
14447                                      * action here is necessary, as regexec.c
14448                                      * properly handles the general case for
14449                                      * UTF-8 matching and multi-char folds */
14450                                     break;
14451                                 default:
14452                                     /* Use deprecated warning to increase the
14453                                     * chances of this being output */
14454                                     ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
14455                                     break;
14456                             }
14457                         }
14458                         continue;
14459                     }
14460
14461                     /* Here is an above Latin1 character.  We don't have the
14462                      * rules hard-coded for it.  First, get its fold.  This is
14463                      * the simple fold, as the multi-character folds have been
14464                      * handled earlier and separated out */
14465                     _to_uni_fold_flags(j, foldbuf, &foldlen,
14466                                                         (ASCII_FOLD_RESTRICTED)
14467                                                         ? FOLD_FLAGS_NOMIX_ASCII
14468                                                         : 0);
14469
14470                     /* Single character fold of above Latin1.  Add everything in
14471                     * its fold closure to the list that this node should match.
14472                     * The fold closures data structure is a hash with the keys
14473                     * being the UTF-8 of every character that is folded to, like
14474                     * 'k', and the values each an array of all code points that
14475                     * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
14476                     * Multi-character folds are not included */
14477                     if ((listp = hv_fetch(PL_utf8_foldclosures,
14478                                         (char *) foldbuf, foldlen, FALSE)))
14479                     {
14480                         AV* list = (AV*) *listp;
14481                         IV k;
14482                         for (k = 0; k <= av_tindex(list); k++) {
14483                             SV** c_p = av_fetch(list, k, FALSE);
14484                             UV c;
14485                             if (c_p == NULL) {
14486                                 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
14487                             }
14488                             c = SvUV(*c_p);
14489
14490                             /* /aa doesn't allow folds between ASCII and non- */
14491                             if ((ASCII_FOLD_RESTRICTED
14492                                 && (isASCII(c) != isASCII(j))))
14493                             {
14494                                 continue;
14495                             }
14496
14497                             /* Folds under /l which cross the 255/256 boundary
14498                              * are added to a separate list.  (These are valid
14499                              * only when the locale is UTF-8.) */
14500                             if (c < 256 && LOC) {
14501                                 *use_list = add_cp_to_invlist(*use_list, c);
14502                                 continue;
14503                             }
14504
14505                             if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
14506                             {
14507                                 cp_list = add_cp_to_invlist(cp_list, c);
14508                             }
14509                             else {
14510                                 /* Similarly folds involving non-ascii Latin1
14511                                 * characters under /d are added to their list */
14512                                 depends_list = add_cp_to_invlist(depends_list,
14513                                                                  c);
14514                             }
14515                         }
14516                     }
14517                 }
14518             }
14519             SvREFCNT_dec_NN(fold_intersection);
14520         }
14521
14522         /* Now that we have finished adding all the folds, there is no reason
14523          * to keep the foldable list separate */
14524         _invlist_union(cp_list, cp_foldable_list, &cp_list);
14525         SvREFCNT_dec_NN(cp_foldable_list);
14526     }
14527
14528     /* And combine the result (if any) with any inversion list from posix
14529      * classes.  The lists are kept separate up to now because we don't want to
14530      * fold the classes (folding of those is automatically handled by the swash
14531      * fetching code) */
14532     if (posixes || nposixes) {
14533         if (posixes && AT_LEAST_ASCII_RESTRICTED) {
14534             /* Under /a and /aa, nothing above ASCII matches these */
14535             _invlist_intersection(posixes,
14536                                   PL_XPosix_ptrs[_CC_ASCII],
14537                                   &posixes);
14538         }
14539         if (nposixes) {
14540             if (DEPENDS_SEMANTICS) {
14541                 /* Under /d, everything in the upper half of the Latin1 range
14542                  * matches these complements */
14543                 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_NON_ASCII_ALL;
14544             }
14545             else if (AT_LEAST_ASCII_RESTRICTED) {
14546                 /* Under /a and /aa, everything above ASCII matches these
14547                  * complements */
14548                 _invlist_union_complement_2nd(nposixes,
14549                                               PL_XPosix_ptrs[_CC_ASCII],
14550                                               &nposixes);
14551             }
14552             if (posixes) {
14553                 _invlist_union(posixes, nposixes, &posixes);
14554                 SvREFCNT_dec_NN(nposixes);
14555             }
14556             else {
14557                 posixes = nposixes;
14558             }
14559         }
14560         if (! DEPENDS_SEMANTICS) {
14561             if (cp_list) {
14562                 _invlist_union(cp_list, posixes, &cp_list);
14563                 SvREFCNT_dec_NN(posixes);
14564             }
14565             else {
14566                 cp_list = posixes;
14567             }
14568         }
14569         else {
14570             /* Under /d, we put into a separate list the Latin1 things that
14571              * match only when the target string is utf8 */
14572             SV* nonascii_but_latin1_properties = NULL;
14573             _invlist_intersection(posixes, PL_UpperLatin1,
14574                                   &nonascii_but_latin1_properties);
14575             _invlist_subtract(posixes, nonascii_but_latin1_properties,
14576                               &posixes);
14577             if (cp_list) {
14578                 _invlist_union(cp_list, posixes, &cp_list);
14579                 SvREFCNT_dec_NN(posixes);
14580             }
14581             else {
14582                 cp_list = posixes;
14583             }
14584
14585             if (depends_list) {
14586                 _invlist_union(depends_list, nonascii_but_latin1_properties,
14587                                &depends_list);
14588                 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
14589             }
14590             else {
14591                 depends_list = nonascii_but_latin1_properties;
14592             }
14593         }
14594     }
14595
14596     /* And combine the result (if any) with any inversion list from properties.
14597      * The lists are kept separate up to now so that we can distinguish the two
14598      * in regards to matching above-Unicode.  A run-time warning is generated
14599      * if a Unicode property is matched against a non-Unicode code point. But,
14600      * we allow user-defined properties to match anything, without any warning,
14601      * and we also suppress the warning if there is a portion of the character
14602      * class that isn't a Unicode property, and which matches above Unicode, \W
14603      * or [\x{110000}] for example.
14604      * (Note that in this case, unlike the Posix one above, there is no
14605      * <depends_list>, because having a Unicode property forces Unicode
14606      * semantics */
14607     if (properties) {
14608         if (cp_list) {
14609
14610             /* If it matters to the final outcome, see if a non-property
14611              * component of the class matches above Unicode.  If so, the
14612              * warning gets suppressed.  This is true even if just a single
14613              * such code point is specified, as though not strictly correct if
14614              * another such code point is matched against, the fact that they
14615              * are using above-Unicode code points indicates they should know
14616              * the issues involved */
14617             if (warn_super) {
14618                 warn_super = ! (invert
14619                                ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
14620             }
14621
14622             _invlist_union(properties, cp_list, &cp_list);
14623             SvREFCNT_dec_NN(properties);
14624         }
14625         else {
14626             cp_list = properties;
14627         }
14628
14629         if (warn_super) {
14630             ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
14631         }
14632     }
14633
14634     /* Here, we have calculated what code points should be in the character
14635      * class.
14636      *
14637      * Now we can see about various optimizations.  Fold calculation (which we
14638      * did above) needs to take place before inversion.  Otherwise /[^k]/i
14639      * would invert to include K, which under /i would match k, which it
14640      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
14641      * folded until runtime */
14642
14643     /* If we didn't do folding, it's because some information isn't available
14644      * until runtime; set the run-time fold flag for these.  (We don't have to
14645      * worry about properties folding, as that is taken care of by the swash
14646      * fetching).  We know to set the flag if we have a non-NULL list for UTF-8
14647      * locales, or the class matches at least one 0-255 range code point */
14648     if (LOC && FOLD) {
14649         if (only_utf8_locale_list) {
14650             ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14651         }
14652         else if (cp_list) { /* Look to see if there a 0-255 code point is in
14653                                the list */
14654             UV start, end;
14655             invlist_iterinit(cp_list);
14656             if (invlist_iternext(cp_list, &start, &end) && start < 256) {
14657                 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14658             }
14659             invlist_iterfinish(cp_list);
14660         }
14661     }
14662
14663     /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
14664      * at compile time.  Besides not inverting folded locale now, we can't
14665      * invert if there are things such as \w, which aren't known until runtime
14666      * */
14667     if (cp_list
14668         && invert
14669         && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
14670         && ! depends_list
14671         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14672     {
14673         _invlist_invert(cp_list);
14674
14675         /* Any swash can't be used as-is, because we've inverted things */
14676         if (swash) {
14677             SvREFCNT_dec_NN(swash);
14678             swash = NULL;
14679         }
14680
14681         /* Clear the invert flag since have just done it here */
14682         invert = FALSE;
14683     }
14684
14685     if (ret_invlist) {
14686         *ret_invlist = cp_list;
14687         SvREFCNT_dec(swash);
14688
14689         /* Discard the generated node */
14690         if (SIZE_ONLY) {
14691             RExC_size = orig_size;
14692         }
14693         else {
14694             RExC_emit = orig_emit;
14695         }
14696         return orig_emit;
14697     }
14698
14699     /* Some character classes are equivalent to other nodes.  Such nodes take
14700      * up less room and generally fewer operations to execute than ANYOF nodes.
14701      * Above, we checked for and optimized into some such equivalents for
14702      * certain common classes that are easy to test.  Getting to this point in
14703      * the code means that the class didn't get optimized there.  Since this
14704      * code is only executed in Pass 2, it is too late to save space--it has
14705      * been allocated in Pass 1, and currently isn't given back.  But turning
14706      * things into an EXACTish node can allow the optimizer to join it to any
14707      * adjacent such nodes.  And if the class is equivalent to things like /./,
14708      * expensive run-time swashes can be avoided.  Now that we have more
14709      * complete information, we can find things necessarily missed by the
14710      * earlier code.  I (khw) am not sure how much to look for here.  It would
14711      * be easy, but perhaps too slow, to check any candidates against all the
14712      * node types they could possibly match using _invlistEQ(). */
14713
14714     if (cp_list
14715         && ! invert
14716         && ! depends_list
14717         && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
14718         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14719
14720            /* We don't optimize if we are supposed to make sure all non-Unicode
14721             * code points raise a warning, as only ANYOF nodes have this check.
14722             * */
14723         && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
14724     {
14725         UV start, end;
14726         U8 op = END;  /* The optimzation node-type */
14727         const char * cur_parse= RExC_parse;
14728
14729         invlist_iterinit(cp_list);
14730         if (! invlist_iternext(cp_list, &start, &end)) {
14731
14732             /* Here, the list is empty.  This happens, for example, when a
14733              * Unicode property is the only thing in the character class, and
14734              * it doesn't match anything.  (perluniprops.pod notes such
14735              * properties) */
14736             op = OPFAIL;
14737             *flagp |= HASWIDTH|SIMPLE;
14738         }
14739         else if (start == end) {    /* The range is a single code point */
14740             if (! invlist_iternext(cp_list, &start, &end)
14741
14742                     /* Don't do this optimization if it would require changing
14743                      * the pattern to UTF-8 */
14744                 && (start < 256 || UTF))
14745             {
14746                 /* Here, the list contains a single code point.  Can optimize
14747                  * into an EXACTish node */
14748
14749                 value = start;
14750
14751                 if (! FOLD) {
14752                     op = EXACT;
14753                 }
14754                 else if (LOC) {
14755
14756                     /* A locale node under folding with one code point can be
14757                      * an EXACTFL, as its fold won't be calculated until
14758                      * runtime */
14759                     op = EXACTFL;
14760                 }
14761                 else {
14762
14763                     /* Here, we are generally folding, but there is only one
14764                      * code point to match.  If we have to, we use an EXACT
14765                      * node, but it would be better for joining with adjacent
14766                      * nodes in the optimization pass if we used the same
14767                      * EXACTFish node that any such are likely to be.  We can
14768                      * do this iff the code point doesn't participate in any
14769                      * folds.  For example, an EXACTF of a colon is the same as
14770                      * an EXACT one, since nothing folds to or from a colon. */
14771                     if (value < 256) {
14772                         if (IS_IN_SOME_FOLD_L1(value)) {
14773                             op = EXACT;
14774                         }
14775                     }
14776                     else {
14777                         if (_invlist_contains_cp(PL_utf8_foldable, value)) {
14778                             op = EXACT;
14779                         }
14780                     }
14781
14782                     /* If we haven't found the node type, above, it means we
14783                      * can use the prevailing one */
14784                     if (op == END) {
14785                         op = compute_EXACTish(pRExC_state);
14786                     }
14787                 }
14788             }
14789         }
14790         else if (start == 0) {
14791             if (end == UV_MAX) {
14792                 op = SANY;
14793                 *flagp |= HASWIDTH|SIMPLE;
14794                 RExC_naughty++;
14795             }
14796             else if (end == '\n' - 1
14797                     && invlist_iternext(cp_list, &start, &end)
14798                     && start == '\n' + 1 && end == UV_MAX)
14799             {
14800                 op = REG_ANY;
14801                 *flagp |= HASWIDTH|SIMPLE;
14802                 RExC_naughty++;
14803             }
14804         }
14805         invlist_iterfinish(cp_list);
14806
14807         if (op != END) {
14808             RExC_parse = (char *)orig_parse;
14809             RExC_emit = (regnode *)orig_emit;
14810
14811             ret = reg_node(pRExC_state, op);
14812
14813             RExC_parse = (char *)cur_parse;
14814
14815             if (PL_regkind[op] == EXACT) {
14816                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14817                                            TRUE /* downgradable to EXACT */
14818                                           );
14819             }
14820
14821             SvREFCNT_dec_NN(cp_list);
14822             return ret;
14823         }
14824     }
14825
14826     /* Here, <cp_list> contains all the code points we can determine at
14827      * compile time that match under all conditions.  Go through it, and
14828      * for things that belong in the bitmap, put them there, and delete from
14829      * <cp_list>.  While we are at it, see if everything above 255 is in the
14830      * list, and if so, set a flag to speed up execution */
14831
14832     populate_ANYOF_from_invlist(ret, &cp_list);
14833
14834     if (invert) {
14835         ANYOF_FLAGS(ret) |= ANYOF_INVERT;
14836     }
14837
14838     /* Here, the bitmap has been populated with all the Latin1 code points that
14839      * always match.  Can now add to the overall list those that match only
14840      * when the target string is UTF-8 (<depends_list>). */
14841     if (depends_list) {
14842         if (cp_list) {
14843             _invlist_union(cp_list, depends_list, &cp_list);
14844             SvREFCNT_dec_NN(depends_list);
14845         }
14846         else {
14847             cp_list = depends_list;
14848         }
14849         ANYOF_FLAGS(ret) |= ANYOF_UTF8;
14850     }
14851
14852     /* If there is a swash and more than one element, we can't use the swash in
14853      * the optimization below. */
14854     if (swash && element_count > 1) {
14855         SvREFCNT_dec_NN(swash);
14856         swash = NULL;
14857     }
14858
14859     set_ANYOF_arg(pRExC_state, ret, cp_list,
14860                   (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14861                    ? listsv : NULL,
14862                   only_utf8_locale_list,
14863                   swash, has_user_defined_property);
14864
14865     *flagp |= HASWIDTH|SIMPLE;
14866
14867     if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
14868         RExC_contains_locale = 1;
14869     }
14870
14871     return ret;
14872 }
14873
14874 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14875
14876 STATIC void
14877 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
14878                 regnode* const node,
14879                 SV* const cp_list,
14880                 SV* const runtime_defns,
14881                 SV* const only_utf8_locale_list,
14882                 SV* const swash,
14883                 const bool has_user_defined_property)
14884 {
14885     /* Sets the arg field of an ANYOF-type node 'node', using information about
14886      * the node passed-in.  If there is nothing outside the node's bitmap, the
14887      * arg is set to ANYOF_NONBITMAP_EMPTY.  Otherwise, it sets the argument to
14888      * the count returned by add_data(), having allocated and stored an array,
14889      * av, which that count references, as follows:
14890      *  av[0] stores the character class description in its textual form.
14891      *        This is used later (regexec.c:Perl_regclass_swash()) to
14892      *        initialize the appropriate swash, and is also useful for dumping
14893      *        the regnode.  This is set to &PL_sv_undef if the textual
14894      *        description is not needed at run-time (as happens if the other
14895      *        elements completely define the class)
14896      *  av[1] if &PL_sv_undef, is a placeholder to later contain the swash
14897      *        computed from av[0].  But if no further computation need be done,
14898      *        the swash is stored here now (and av[0] is &PL_sv_undef).
14899      *  av[2] stores the inversion list of code points that match only if the
14900      *        current locale is UTF-8
14901      *  av[3] stores the cp_list inversion list for use in addition or instead
14902      *        of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
14903      *        (Otherwise everything needed is already in av[0] and av[1])
14904      *  av[4] is set if any component of the class is from a user-defined
14905      *        property; used only if av[3] exists
14906      *
14907      * Reference counts: 'runtime_defns' has its count incremented when it is
14908      * stored; 'cp_list', 'only_utf8_locale_list' and 'swash' are stored as-is
14909      * (av takes over the caller's reference) or else are released here. */
14910     UV n;
14911
14912     PERL_ARGS_ASSERT_SET_ANYOF_ARG;
14913
14914     if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
14915         assert(! (ANYOF_FLAGS(node)
14916                     & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8)));
14917         ARG_SET(node, ANYOF_NONBITMAP_EMPTY);
14918         SvREFCNT_dec(swash);    /* Nothing will refer to it; don't leak it */
14919     }
14920     else {
14921         AV * const av = newAV();
14922         SV *rv;
14923
14924         assert(ANYOF_FLAGS(node)
14925                     & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD));
14926
14927         av_store(av, 0, (runtime_defns)
14928                         ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
14929         if (swash) {
14930             /* cp_list isn't stored when there's a swash; it may be NULL */
14931             av_store(av, 1, swash);
14932             SvREFCNT_dec(cp_list);
14933         }
14934         else {
14935             av_store(av, 1, &PL_sv_undef);
14936             if (cp_list) {
14937                 av_store(av, 3, cp_list);
14938                 av_store(av, 4, newSVuv(has_user_defined_property));
14939             }
14940         }
14941         av_store(av, 2, (only_utf8_locale_list)
14942                         ? only_utf8_locale_list : &PL_sv_undef);
14943
14944         rv = newRV_noinc(MUTABLE_SV(av));
14945         n = add_data(pRExC_state, STR_WITH_LEN("s"));
14946         RExC_rxi->data->data[n] = (void*)rv;
14947         ARG_SET(node, n);
14948     }
14949 }
14950
14951
14952 /* reg_skipcomment()
14953
14954    Consumes the remainder of an /x style '#' comment from the pattern
14955    being parsed, up to and including the newline that ends it.
14956
14957    Returns true if a terminating newline was found (RExC_parse is left
14958    just after it).  Returns false, and sets REG_RUN_ON_COMMENT_SEEN in
14959    RExC_seen, if the end of the pattern was reached first.
14960
14961    The caller is responsible for only calling this when in /x mode.
14962 */
14963
14964 STATIC bool
14965 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
14966 {
14967     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
14968
14969     /* Step over characters one at a time; the newline itself is consumed */
14970     while (RExC_parse < RExC_end) {
14971         const char consumed = *RExC_parse;
14972
14973         RExC_parse++;
14974         if (consumed == '\n') {
14975             return 1;
14976         }
14977     }
14978
14979     /* We ran off the end of the pattern without ending the comment, so
14980      * a \n has to be added when wrapping */
14981     RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
14982     return 0;
14983 }
14984
14985 /* nextchar()
14986
14987    Advances the parse position by one character, and then past any
14988    "whitespace" that follows it in the input stream.
14989
14990    Without /x "whitespace" means (?#...) style comments only,
14991    with /x this means (?#...) and # comments and whitespace proper.
14992    An unterminated (?#...) comment is a fatal error.
14993
14994    Returns the RExC_parse point from BEFORE the scan occurs.
14995    This is the /x friendly way of saying RExC_parse++.
14996 */
14997
14998 STATIC char*
14999 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
15000 {
15001     char* const retval = RExC_parse++;
15002
15003     PERL_ARGS_ASSERT_NEXTCHAR;
15004
15005     for (;;) {
15006         if (RExC_end - RExC_parse >= 3
15007             && *RExC_parse == '('
15008             && RExC_parse[1] == '?'
15009             && RExC_parse[2] == '#')
15010         {
15011             /* Test the bound first so the byte at RExC_end is never read */
15012             while (RExC_parse < RExC_end && *RExC_parse != ')')
15013                 RExC_parse++;
15014             if (RExC_parse >= RExC_end)
15015                 FAIL("Sequence (?#... not terminated");
15016             RExC_parse++;
15017             continue;
15018         }
15019         if ((RExC_flags & RXf_PMf_EXTENDED) && RExC_parse < RExC_end) {
15020             if (isSPACE(*RExC_parse)) {
15021                 RExC_parse++;
15022                 continue;
15023             }
15024             else if (*RExC_parse == '#') {
15025                 if ( reg_skipcomment( pRExC_state ) )
15026                     continue;
15027             }
15028         }
15029         return retval;
15030     }
15031 }
15032
15033 /*
15034 - reg_node - emit a node of type 'op', which has no argument, at RExC_emit.
15035 - Returns the value RExC_emit had on entry, i.e. where the node goes. */
15036 STATIC regnode *                        /* Location. */
15037 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
15038 {
15039     dVAR;
15040     regnode *ptr;
15041     regnode * const ret = RExC_emit;
15042     GET_RE_DEBUG_FLAGS_DECL;
15043
15044     PERL_ARGS_ASSERT_REG_NODE;
15045     /* When only sizing, nothing is emitted: just add to the size needed */
15046     if (SIZE_ONLY) {
15047         SIZE_ALIGN(RExC_size);
15048         RExC_size += 1;
15049         return(ret);
15050     }
15051     if (RExC_emit >= RExC_emit_bound)   /* No room left to emit into */
15052         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15053                    op, RExC_emit, RExC_emit_bound);
15054     /* 'ptr' is handed to FILL_ADVANCE_NODE, and then becomes RExC_emit */
15055     NODE_ALIGN_FILL(ret);
15056     ptr = ret;
15057     FILL_ADVANCE_NODE(ptr, op);
15058 #ifdef RE_TRACK_PATTERN_OFFSETS
15059     if (RExC_offsets) {         /* MJD */
15060         MJD_OFFSET_DEBUG(
15061               ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
15062               "reg_node", __LINE__,
15063               PL_reg_name[op],
15064               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
15065                 ? "Overwriting end of array!\n" : "OK",
15066               (UV)(RExC_emit - RExC_emit_start),
15067               (UV)(RExC_parse - RExC_start),
15068               (UV)RExC_offsets[0]));
15069         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
15070     }
15071 #endif
15072     RExC_emit = ptr;
15073     return(ret);
15074 }
15075
15076 /*
15077 - reganode - emit a node of type 'op' with the argument 'arg' at RExC_emit.
15078 - Returns the value RExC_emit had on entry, i.e. where the node goes. */
15079 STATIC regnode *                        /* Location. */
15080 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
15081 {
15082     dVAR;
15083     regnode *ptr;
15084     regnode * const ret = RExC_emit;
15085     GET_RE_DEBUG_FLAGS_DECL;
15086
15087     PERL_ARGS_ASSERT_REGANODE;
15088     /* When only sizing, nothing is emitted: just add to the size needed */
15089     if (SIZE_ONLY) {
15090         SIZE_ALIGN(RExC_size);
15091         RExC_size += 2;
15092         /*
15093            We can't do this:
15094
15095            assert(2==regarglen[op]+1);
15096
15097            Anything larger than this has to allocate the extra amount.
15098            If we changed this to be:
15099
15100            RExC_size += (1 + regarglen[op]);
15101
15102            then it wouldn't matter. It's not clear what side effect
15103            might come from that so it's not done so far.
15104            -- dmq
15105         */
15106         return(ret);
15107     }
15108     if (RExC_emit >= RExC_emit_bound)   /* No room left to emit into */
15109         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15110                    op, RExC_emit, RExC_emit_bound);
15111     /* NOTE(review): the panic text above says reg_node, not reganode */
15112     NODE_ALIGN_FILL(ret);
15113     ptr = ret;
15114     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
15115 #ifdef RE_TRACK_PATTERN_OFFSETS
15116     if (RExC_offsets) {         /* MJD */
15117         MJD_OFFSET_DEBUG(
15118               ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15119               "reganode",
15120               __LINE__,
15121               PL_reg_name[op],
15122               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
15123               "Overwriting end of array!\n" : "OK",
15124               (UV)(RExC_emit - RExC_emit_start),
15125               (UV)(RExC_parse - RExC_start),
15126               (UV)RExC_offsets[0]));
15127         Set_Cur_Node_Offset;
15128     }
15129 #endif
15130     RExC_emit = ptr;    /* As left by FILL_ADVANCE_NODE_ARG */
15131     return(ret);
15132 }
15133
15134 /* reguni - when only sizing, returns UNISKIP(uv) without touching 's';
15135  * otherwise writes 'uv' to 's' using uvchr_to_utf8() and returns the
15136  * distance from 's' to the pointer which that function returned */
15137 PERL_STATIC_INLINE STRLEN
15138 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
15139 {
15140     dVAR;
15141     PERL_ARGS_ASSERT_REGUNI;
15142     if (SIZE_ONLY)
15143         return UNISKIP(uv);
15144     return uvchr_to_utf8((U8*)s, uv) - (U8*)s;
15145 }
15146
15147 /*
15148 - reginsert - insert an operator 'op' in front of already-emitted operand
15149 * 'opnd'.  Means relocating the operand: the regnodes from 'opnd' up to
15150 * RExC_emit are moved up by NODE_STEP_REGNODE + regarglen[op] regnodes.
15151 */
15152 STATIC void
15153 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
15154 {
15155     dVAR;
15156     regnode *src;
15157     regnode *dst;
15158     regnode *place;
15159     const int offset = regarglen[(U8)op];
15160     const int size = NODE_STEP_REGNODE + offset;
15161     GET_RE_DEBUG_FLAGS_DECL;
15162
15163     PERL_ARGS_ASSERT_REGINSERT;
15164     PERL_UNUSED_ARG(depth);
15165 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
15166     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
15167     if (SIZE_ONLY) {    /* Only sizing: account for the new node and leave */
15168         RExC_size += size;
15169         return;
15170     }
15171     /* Extend the program by 'size', and adjust the recorded paren positions */
15172     src = RExC_emit;
15173     RExC_emit += size;
15174     dst = RExC_emit;
15175     if (RExC_open_parens) {
15176         int paren;
15177         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
15178         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
15179             if ( RExC_open_parens[paren] >= opnd ) {
15180                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
15181                 RExC_open_parens[paren] += size;
15182             } else {
15183                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
15184             }
15185             if ( RExC_close_parens[paren] >= opnd ) {
15186                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
15187                 RExC_close_parens[paren] += size;
15188             } else {
15189                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
15190             }
15191         }
15192     }
15193     /* Copy the operand up by 'size' regnodes, working from its end downwards */
15194     while (src > opnd) {
15195         StructCopy(--src, --dst, regnode);
15196 #ifdef RE_TRACK_PATTERN_OFFSETS
15197         if (RExC_offsets) {     /* MJD 20010112 */
15198             MJD_OFFSET_DEBUG(
15199                  ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
15200                   "reg_insert",
15201                   __LINE__,
15202                   PL_reg_name[op],
15203                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
15204                     ? "Overwriting end of array!\n" : "OK",
15205                   (UV)(src - RExC_emit_start),
15206                   (UV)(dst - RExC_emit_start),
15207                   (UV)RExC_offsets[0]));
15208             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
15209             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
15210         }
15211 #endif
15212     }
15213
15214     /* Now fill in the new node, in the space just vacated */
15215     place = opnd;               /* Op node, where operand used to be. */
15216 #ifdef RE_TRACK_PATTERN_OFFSETS
15217     if (RExC_offsets) {         /* MJD */
15218         MJD_OFFSET_DEBUG(
15219               ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15220               "reginsert",
15221               __LINE__,
15222               PL_reg_name[op],
15223               (UV)(place - RExC_emit_start) > RExC_offsets[0]
15224               ? "Overwriting end of array!\n" : "OK",
15225               (UV)(place - RExC_emit_start),
15226               (UV)(RExC_parse - RExC_start),
15227               (UV)RExC_offsets[0]));
15228         Set_Node_Offset(place, RExC_parse);
15229         Set_Node_Length(place, 1);
15230     }
15231 #endif
15232     src = NEXTOPER(place);
15233     FILL_ADVANCE_NODE(place, op);
15234     Zero(src, offset, regnode);     /* 'offset' regnodes, from NEXTOPER() */
15235 }
15236
15237 /*
15238 - regtail - set the next-pointer at the end of a node chain of p to val.
15239 - Does nothing when only sizing.  SEE ALSO: regtail_study
15240 */
15241 /* TODO: All three parms should be const */
15242 STATIC void
15243 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15244                 const regnode *val,U32 depth)
15245 {
15246     dVAR;
15247     regnode *scan;
15248     GET_RE_DEBUG_FLAGS_DECL;
15249
15250     PERL_ARGS_ASSERT_REGTAIL;
15251 #ifndef DEBUGGING
15252     PERL_UNUSED_ARG(depth);
15253 #endif
15254
15255     if (SIZE_ONLY)
15256         return;
15257
15258     /* Find last node. */
15259     scan = p;
15260     for (;;) {
15261         regnode * const temp = regnext(scan);
15262         DEBUG_PARSE_r({
15263             SV * const mysv=sv_newmortal();
15264             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
15265             regprop(RExC_rx, mysv, scan, NULL);
15266             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
15267                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
15268                     (temp == NULL ? "->" : ""),
15269                     (temp == NULL ? PL_reg_name[OP(val)] : "")
15270             );
15271         });
15272         if (temp == NULL)       /* 'scan' is the end of the chain */
15273             break;
15274         scan = temp;
15275     }
15276     /* Store the offset to 'val' in whichever field this type of node uses */
15277     if (reg_off_by_arg[OP(scan)]) {
15278         ARG_SET(scan, val - scan);
15279     }
15280     else {
15281         NEXT_OFF(scan) = val - scan;
15282     }
15283 }
15284
15285 #ifdef DEBUGGING
15286 /*
15287 - regtail_study - set the next-pointer at the end of a node chain of p to val.
15288 - Look for optimizable sequences at the same time.
15289 - currently only looks for EXACT chains.
15290
15291 This is experimental code. The idea is to use this routine to perform
15292 in place optimizations on branches and groups as they are constructed,
15293 with the long term intention of removing optimization from study_chunk so
15294 that it is purely analytical.  Currently only used when in DEBUG mode.
15295 The macro REGTAIL_STUDY() is used to control which is which.
15296
15297 Returns PSEUDO when only sizing or if every node in the chain is a NOTHING;
15298 the opcode of the EXACTish nodes in the chain if they all have the same one
15299 (NOTHING nodes being ignored); and 0 for any other chain. */
15300 /* TODO: All four parms should be const */
15301
15302 STATIC U8
15303 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15304                       const regnode *val,U32 depth)
15305 {
15306     dVAR;
15307     regnode *scan;
15308     U8 exact = PSEUDO;
15309 #ifdef EXPERIMENTAL_INPLACESCAN
15310     I32 min = 0;
15311 #endif
15312     GET_RE_DEBUG_FLAGS_DECL;
15313
15314     PERL_ARGS_ASSERT_REGTAIL_STUDY;
15315
15316
15317     if (SIZE_ONLY)
15318         return exact;
15319
15320     /* Find last node. */
15321     /* 'exact' tracks the kind of the nodes passed over on the way there */
15322     scan = p;
15323     for (;;) {
15324         regnode * const temp = regnext(scan);
15325 #ifdef EXPERIMENTAL_INPLACESCAN
15326         if (PL_regkind[OP(scan)] == EXACT) {
15327             bool unfolded_multi_char;   /* Unexamined in this routine */
15328             if (join_exact(pRExC_state, scan, &min,
15329                            &unfolded_multi_char, 1, val, depth+1))
15330                 return EXACT;
15331         }
15332 #endif
15333         if ( exact ) {
15334             switch (OP(scan)) {
15335                 case EXACT:
15336                 case EXACTF:
15337                 case EXACTFA_NO_TRIE:
15338                 case EXACTFA:
15339                 case EXACTFU:
15340                 case EXACTFU_SS:
15341                 case EXACTFL:
15342                         if( exact == PSEUDO )
15343                             exact= OP(scan);
15344                         else if ( exact != OP(scan) )
15345                             exact= 0;       /* Mixed kinds of EXACTish */
15346                 case NOTHING:       /* (the cases above fall through to here) */
15347                     break;
15348                 default:
15349                     exact= 0;
15350             }
15351         }
15352         DEBUG_PARSE_r({
15353             SV * const mysv=sv_newmortal();
15354             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
15355             regprop(RExC_rx, mysv, scan, NULL);
15356             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
15357                 SvPV_nolen_const(mysv),
15358                 REG_NODE_NUM(scan),
15359                 PL_reg_name[exact]);
15360         });
15361         if (temp == NULL)
15362             break;
15363         scan = temp;
15364     }
15365     DEBUG_PARSE_r({
15366         SV * const mysv_val=sv_newmortal();
15367         DEBUG_PARSE_MSG("");
15368         regprop(RExC_rx, mysv_val, val, NULL);
15369         PerlIO_printf(Perl_debug_log,
15370                       "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
15371                       SvPV_nolen_const(mysv_val),
15372                       (IV)REG_NODE_NUM(val),
15373                       (IV)(val - scan)
15374         );
15375     });
15376     if (reg_off_by_arg[OP(scan)]) {     /* Which field holds the offset */
15377         ARG_SET(scan, val - scan);
15378     }
15379     else {
15380         NEXT_OFF(scan) = val - scan;
15381     }
15382
15383     return exact;
15384 }
15385 #endif
15386
15387 /*
15388  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
15389  */
15390 #ifdef DEBUGGING
15391 /* Prints to Perl_debug_log the name (from PL_reg_intflags_name) of each bit
15392  * set in 'flags', each followed by a space.  A non-NULL 'lead' is output
15393  * before the first name, and a newline after the last; if no bit is set,
15394  * 'lead' followed by "[none-set]\n" is output instead. */
15395 static void
15396 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
15397 {
15398     int bit;
15399     int set=0;
15400
15401     ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15402     for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
15403         if (flags & ((U32)1<<bit)) { /* unsigned: 1<<31 overflows an int */
15404             if (!set++ && lead)
15405                 PerlIO_printf(Perl_debug_log, "%s",lead);
15406             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
15407         }
15408     }
15409     if (lead && set)
15410         PerlIO_printf(Perl_debug_log, "\n");
15411     else if (lead)
15412         PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15413 }
15414
15415 static void
15416 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
15417 {
15418     int bit;
15419     int set=0;
15420     regex_charset cs;
15421
15422     ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15423
15424     for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
15425         if (flags & (1<<bit)) {
15426             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
15427                 continue;
15428             }
15429             if (!set++ && lead)
15430                 PerlIO_printf(Perl_debug_log, "%s",lead);
15431             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
15432         }
15433     }
15434     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
15435             if (!set++ && lead) {
15436                 PerlIO_printf(Perl_debug_log, "%s",lead);
15437             }
15438             switch (cs) {
15439                 case REGEX_UNICODE_CHARSET:
15440                     PerlIO_printf(Perl_debug_log, "UNICODE");
15441                     break;
15442                 case REGEX_LOCALE_CHARSET:
15443                     PerlIO_printf(Perl_debug_log, "LOCALE");
15444                     break;
15445                 case REGEX_ASCII_RESTRICTED_CHARSET:
15446                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
15447                     break;
15448                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
15449                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
15450                     break;
15451                 default:
15452                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
15453                     break;
15454             }
15455     }
15456     if (lead)  {
15457         if (set)
15458             PerlIO_printf(Perl_debug_log, "\n");
15459         else
15460             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15461     }
15462 }
15463 #endif
15464
15465 void
15466 Perl_regdump(pTHX_ const regexp *r)
15467 {
15468 #ifdef DEBUGGING
15469     dVAR;
15470     SV * const sv = sv_newmortal();
15471     SV *dsv= sv_newmortal();
15472     RXi_GET_DECL(r,ri);
15473     GET_RE_DEBUG_FLAGS_DECL;
15474
15475     PERL_ARGS_ASSERT_REGDUMP;
15476
15477     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
15478
15479     /* Header fields of interest. */
15480     if (r->anchored_substr) {
15481         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
15482             RE_SV_DUMPLEN(r->anchored_substr), 30);
15483         PerlIO_printf(Perl_debug_log,
15484                       "anchored %s%s at %"IVdf" ",
15485                       s, RE_SV_TAIL(r->anchored_substr),
15486                       (IV)r->anchored_offset);
15487     } else if (r->anchored_utf8) {
15488         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
15489             RE_SV_DUMPLEN(r->anchored_utf8), 30);
15490         PerlIO_printf(Perl_debug_log,
15491                       "anchored utf8 %s%s at %"IVdf" ",
15492                       s, RE_SV_TAIL(r->anchored_utf8),
15493                       (IV)r->anchored_offset);
15494     }
15495     if (r->float_substr) {
15496         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
15497             RE_SV_DUMPLEN(r->float_substr), 30);
15498         PerlIO_printf(Perl_debug_log,
15499                       "floating %s%s at %"IVdf"..%"UVuf" ",
15500                       s, RE_SV_TAIL(r->float_substr),
15501                       (IV)r->float_min_offset, (UV)r->float_max_offset);
15502     } else if (r->float_utf8) {
15503         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
15504             RE_SV_DUMPLEN(r->float_utf8), 30);
15505         PerlIO_printf(Perl_debug_log,
15506                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
15507                       s, RE_SV_TAIL(r->float_utf8),
15508                       (IV)r->float_min_offset, (UV)r->float_max_offset);
15509     }
15510     if (r->check_substr || r->check_utf8)
15511         PerlIO_printf(Perl_debug_log,
15512                       (const char *)
15513                       (r->check_substr == r->float_substr
15514                        && r->check_utf8 == r->float_utf8
15515                        ? "(checking floating" : "(checking anchored"));
15516     if (r->intflags & PREGf_NOSCAN)
15517         PerlIO_printf(Perl_debug_log, " noscan");
15518     if (r->extflags & RXf_CHECK_ALL)
15519         PerlIO_printf(Perl_debug_log, " isall");
15520     if (r->check_substr || r->check_utf8)
15521         PerlIO_printf(Perl_debug_log, ") ");
15522
15523     if (ri->regstclass) {
15524         regprop(r, sv, ri->regstclass, NULL);
15525         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
15526     }
15527     if (r->intflags & PREGf_ANCH) {
15528         PerlIO_printf(Perl_debug_log, "anchored");
15529         if (r->intflags & PREGf_ANCH_BOL)
15530             PerlIO_printf(Perl_debug_log, "(BOL)");
15531         if (r->intflags & PREGf_ANCH_MBOL)
15532             PerlIO_printf(Perl_debug_log, "(MBOL)");
15533         if (r->intflags & PREGf_ANCH_SBOL)
15534             PerlIO_printf(Perl_debug_log, "(SBOL)");
15535         if (r->intflags & PREGf_ANCH_GPOS)
15536             PerlIO_printf(Perl_debug_log, "(GPOS)");
15537         PerlIO_putc(Perl_debug_log, ' ');
15538     }
15539     if (r->intflags & PREGf_GPOS_SEEN)
15540         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
15541     if (r->intflags & PREGf_SKIP)
15542         PerlIO_printf(Perl_debug_log, "plus ");
15543     if (r->intflags & PREGf_IMPLICIT)
15544         PerlIO_printf(Perl_debug_log, "implicit ");
15545     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
15546     if (r->extflags & RXf_EVAL_SEEN)
15547         PerlIO_printf(Perl_debug_log, "with eval ");
15548     PerlIO_printf(Perl_debug_log, "\n");
15549     DEBUG_FLAGS_r({
15550         regdump_extflags("r->extflags: ",r->extflags);
15551         regdump_intflags("r->intflags: ",r->intflags);
15552     });
15553 #else
15554     PERL_ARGS_ASSERT_REGDUMP;
15555     PERL_UNUSED_CONTEXT;
15556     PERL_UNUSED_ARG(r);
15557 #endif  /* DEBUGGING */
15558 }
15559
15560 /*
15561 - regprop - printable representation of opcode, with run time support
15562 */
15563
15564 void
15565 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo)
15566 {
15567 #ifdef DEBUGGING
15568     dVAR;
15569     int k;
15570
15571     /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
15572     static const char * const anyofs[] = {
15573 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
15574     || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
15575     || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
15576     || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
15577     || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
15578     || _CC_VERTSPACE != 16
15579   #error Need to adjust order of anyofs[]
15580 #endif
15581         "\\w",
15582         "\\W",
15583         "\\d",
15584         "\\D",
15585         "[:alpha:]",
15586         "[:^alpha:]",
15587         "[:lower:]",
15588         "[:^lower:]",
15589         "[:upper:]",
15590         "[:^upper:]",
15591         "[:punct:]",
15592         "[:^punct:]",
15593         "[:print:]",
15594         "[:^print:]",
15595         "[:alnum:]",
15596         "[:^alnum:]",
15597         "[:graph:]",
15598         "[:^graph:]",
15599         "[:cased:]",
15600         "[:^cased:]",
15601         "\\s",
15602         "\\S",
15603         "[:blank:]",
15604         "[:^blank:]",
15605         "[:xdigit:]",
15606         "[:^xdigit:]",
15607         "[:space:]",
15608         "[:^space:]",
15609         "[:cntrl:]",
15610         "[:^cntrl:]",
15611         "[:ascii:]",
15612         "[:^ascii:]",
15613         "\\v",
15614         "\\V"
15615     };
15616     RXi_GET_DECL(prog,progi);
15617     GET_RE_DEBUG_FLAGS_DECL;
15618
15619     PERL_ARGS_ASSERT_REGPROP;
15620
15621     sv_setpvs(sv, "");
15622
15623     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
15624         /* It would be nice to FAIL() here, but this may be called from
15625            regexec.c, and it would be hard to supply pRExC_state. */
15626         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
15627                                               (int)OP(o), (int)REGNODE_MAX);
15628     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
15629
15630     k = PL_regkind[OP(o)];
15631
15632     if (k == EXACT) {
15633         sv_catpvs(sv, " ");
15634         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
15635          * is a crude hack but it may be the best for now since
15636          * we have no flag "this EXACTish node was UTF-8"
15637          * --jhi */
15638         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
15639                   PERL_PV_ESCAPE_UNI_DETECT |
15640                   PERL_PV_ESCAPE_NONASCII   |
15641                   PERL_PV_PRETTY_ELLIPSES   |
15642                   PERL_PV_PRETTY_LTGT       |
15643                   PERL_PV_PRETTY_NOCLEAR
15644                   );
15645     } else if (k == TRIE) {
15646         /* print the details of the trie in dumpuntil instead, as
15647          * progi->data isn't available here */
15648         const char op = OP(o);
15649         const U32 n = ARG(o);
15650         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
15651                (reg_ac_data *)progi->data->data[n] :
15652                NULL;
15653         const reg_trie_data * const trie
15654             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
15655
15656         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
15657         DEBUG_TRIE_COMPILE_r(
15658           Perl_sv_catpvf(aTHX_ sv,
15659             "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
15660             (UV)trie->startstate,
15661             (IV)trie->statecount-1, /* -1 because of the unused 0 element */
15662             (UV)trie->wordcount,
15663             (UV)trie->minlen,
15664             (UV)trie->maxlen,
15665             (UV)TRIE_CHARCOUNT(trie),
15666             (UV)trie->uniquecharcount
15667           );
15668         );
15669         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
15670             sv_catpvs(sv, "[");
15671             (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
15672                                                    ? ANYOF_BITMAP(o)
15673                                                    : TRIE_BITMAP(trie));
15674             sv_catpvs(sv, "]");
15675         }
15676
15677     } else if (k == CURLY) {
15678         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
15679             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
15680         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
15681     }
15682     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
15683         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
15684     else if (k == REF || k == OPEN || k == CLOSE
15685              || k == GROUPP || OP(o)==ACCEPT)
15686     {
15687         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
15688         if ( RXp_PAREN_NAMES(prog) ) {
15689             if ( k != REF || (OP(o) < NREF)) {
15690                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
15691                 SV **name= av_fetch(list, ARG(o), 0 );
15692                 if (name)
15693                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15694             }
15695             else {
15696                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
15697                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
15698                 I32 *nums=(I32*)SvPVX(sv_dat);
15699                 SV **name= av_fetch(list, nums[0], 0 );
15700                 I32 n;
15701                 if (name) {
15702                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
15703                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
15704                                     (n ? "," : ""), (IV)nums[n]);
15705                     }
15706                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15707                 }
15708             }
15709         }
15710         if ( k == REF && reginfo) {
15711             U32 n = ARG(o);  /* which paren pair */
15712             I32 ln = prog->offs[n].start;
15713             if (prog->lastparen < n || ln == -1)
15714                 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
15715             else if (ln == prog->offs[n].end)
15716                 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
15717             else {
15718                 const char *s = reginfo->strbeg + ln;
15719                 Perl_sv_catpvf(aTHX_ sv, ": ");
15720                 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
15721                     PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
15722             }
15723         }
15724     } else if (k == GOSUB)
15725         /* Paren and offset */
15726         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
15727     else if (k == VERB) {
15728         if (!o->flags)
15729             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
15730                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
15731     } else if (k == LOGICAL)
15732         /* 2: embedded, otherwise 1 */
15733         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
15734     else if (k == ANYOF) {
15735         const U8 flags = ANYOF_FLAGS(o);
15736         int do_sep = 0;
15737
15738
15739         if (flags & ANYOF_LOCALE_FLAGS)
15740             sv_catpvs(sv, "{loc}");
15741         if (flags & ANYOF_LOC_FOLD)
15742             sv_catpvs(sv, "{i}");
15743         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
15744         if (flags & ANYOF_INVERT)
15745             sv_catpvs(sv, "^");
15746
15747         /* output what the standard cp 0-255 bitmap matches */
15748         do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
15749
15750         /* output any special charclass tests (used entirely under use
15751          * locale) * */
15752         if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
15753             int i;
15754             for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
15755                 if (ANYOF_POSIXL_TEST(o,i)) {
15756                     sv_catpv(sv, anyofs[i]);
15757                     do_sep = 1;
15758                 }
15759             }
15760         }
15761
15762         if ((flags & (ANYOF_ABOVE_LATIN1_ALL
15763                       |ANYOF_UTF8
15764                       |ANYOF_NONBITMAP_NON_UTF8
15765                       |ANYOF_LOC_FOLD)))
15766         {
15767             if (do_sep) {
15768                 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
15769                 if (flags & ANYOF_INVERT)
15770                     /*make sure the invert info is in each */
15771                     sv_catpvs(sv, "^");
15772             }
15773
15774             if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL) {
15775                 sv_catpvs(sv, "{non-utf8-latin1-all}");
15776             }
15777
15778             /* output information about the unicode matching */
15779             if (flags & ANYOF_ABOVE_LATIN1_ALL)
15780                 sv_catpvs(sv, "{unicode_all}");
15781             else if (ARG(o) != ANYOF_NONBITMAP_EMPTY) {
15782                 SV *lv; /* Set if there is something outside the bit map. */
15783                 bool byte_output = FALSE;   /* If something in the bitmap has
15784                                                been output */
15785                 SV *only_utf8_locale;
15786
15787                 /* Get the stuff that wasn't in the bitmap */
15788                 (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
15789                                                     &lv, &only_utf8_locale);
15790                 if (lv && lv != &PL_sv_undef) {
15791                     char *s = savesvpv(lv);
15792                     char * const origs = s;
15793
15794                     while (*s && *s != '\n')
15795                         s++;
15796
15797                     if (*s == '\n') {
15798                         const char * const t = ++s;
15799
15800                         if (flags & ANYOF_NONBITMAP_NON_UTF8) {
15801                             sv_catpvs(sv, "{outside bitmap}");
15802                         }
15803                         else {
15804                             sv_catpvs(sv, "{utf8}");
15805                         }
15806
15807                         if (byte_output) {
15808                             sv_catpvs(sv, " ");
15809                         }
15810
15811                         while (*s) {
15812                             if (*s == '\n') {
15813
15814                                 /* Truncate very long output */
15815                                 if (s - origs > 256) {
15816                                     Perl_sv_catpvf(aTHX_ sv,
15817                                                 "%.*s...",
15818                                                 (int) (s - origs - 1),
15819                                                 t);
15820                                     goto out_dump;
15821                                 }
15822                                 *s = ' ';
15823                             }
15824                             else if (*s == '\t') {
15825                                 *s = '-';
15826                             }
15827                             s++;
15828                         }
15829                         if (s[-1] == ' ')
15830                             s[-1] = 0;
15831
15832                         sv_catpv(sv, t);
15833                     }
15834
15835                 out_dump:
15836
15837                     Safefree(origs);
15838                     SvREFCNT_dec_NN(lv);
15839                 }
15840
15841                 if ((flags & ANYOF_LOC_FOLD)
15842                      && only_utf8_locale
15843                      && only_utf8_locale != &PL_sv_undef)
15844                 {
15845                     UV start, end;
15846                     int max_entries = 256;
15847
15848                     sv_catpvs(sv, "{utf8 locale}");
15849                     invlist_iterinit(only_utf8_locale);
15850                     while (invlist_iternext(only_utf8_locale,
15851                                             &start, &end)) {
15852                         put_range(sv, start, end);
15853                         max_entries --;
15854                         if (max_entries < 0) {
15855                             sv_catpvs(sv, "...");
15856                             break;
15857                         }
15858                     }
15859                     invlist_iterfinish(only_utf8_locale);
15860                 }
15861             }
15862         }
15863
15864         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
15865     }
15866     else if (k == POSIXD || k == NPOSIXD) {
15867         U8 index = FLAGS(o) * 2;
15868         if (index < C_ARRAY_LENGTH(anyofs)) {
15869             if (*anyofs[index] != '[')  {
15870                 sv_catpv(sv, "[");
15871             }
15872             sv_catpv(sv, anyofs[index]);
15873             if (*anyofs[index] != '[')  {
15874                 sv_catpv(sv, "]");
15875             }
15876         }
15877         else {
15878             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
15879         }
15880     }
15881     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
15882         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
15883 #else
15884     PERL_UNUSED_CONTEXT;
15885     PERL_UNUSED_ARG(sv);
15886     PERL_UNUSED_ARG(o);
15887     PERL_UNUSED_ARG(prog);
15888     PERL_UNUSED_ARG(reginfo);
15889 #endif  /* DEBUGGING */
15890 }
15891
15892
15893
15894 SV *
15895 Perl_re_intuit_string(pTHX_ REGEXP * const r)
15896 {                               /* Assume that RE_INTUIT is set */
15897     dVAR;
15898     struct regexp *const prog = ReANY(r);
15899     GET_RE_DEBUG_FLAGS_DECL;
15900
15901     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
15902     PERL_UNUSED_CONTEXT;
15903
15904     DEBUG_COMPILE_r(
15905         {
15906             const char * const s = SvPV_nolen_const(prog->check_substr
15907                       ? prog->check_substr : prog->check_utf8);
15908
15909             if (!PL_colorset) reginitcolors();
15910             PerlIO_printf(Perl_debug_log,
15911                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
15912                       PL_colors[4],
15913                       prog->check_substr ? "" : "utf8 ",
15914                       PL_colors[5],PL_colors[0],
15915                       s,
15916                       PL_colors[1],
15917                       (strlen(s) > 60 ? "..." : ""));
15918         } );
15919
15920     return prog->check_substr ? prog->check_substr : prog->check_utf8;
15921 }
15922
/*
   pregfree()

   handles refcounting and freeing the perl core regexp structure. When
   it is necessary to actually free the structure the first thing it
   does is call the 'free' method of the regexp_engine associated with
   the regexp, allowing the handling of the void *pprivate; member
   first. (This routine is not overridable by extensions, which is why
   the extension's free is called first.)

   See regdupe and regdupe_internal if you change anything here.
*/
15935 #ifndef PERL_IN_XSUB_RE
15936 void
15937 Perl_pregfree(pTHX_ REGEXP *r)
15938 {
15939     SvREFCNT_dec(r);
15940 }
15941
15942 void
15943 Perl_pregfree2(pTHX_ REGEXP *rx)
15944 {
15945     dVAR;
15946     struct regexp *const r = ReANY(rx);
15947     GET_RE_DEBUG_FLAGS_DECL;
15948
15949     PERL_ARGS_ASSERT_PREGFREE2;
15950
15951     if (r->mother_re) {
15952         ReREFCNT_dec(r->mother_re);
15953     } else {
15954         CALLREGFREE_PVT(rx); /* free the private data */
15955         SvREFCNT_dec(RXp_PAREN_NAMES(r));
15956         Safefree(r->xpv_len_u.xpvlenu_pv);
15957     }
15958     if (r->substrs) {
15959         SvREFCNT_dec(r->anchored_substr);
15960         SvREFCNT_dec(r->anchored_utf8);
15961         SvREFCNT_dec(r->float_substr);
15962         SvREFCNT_dec(r->float_utf8);
15963         Safefree(r->substrs);
15964     }
15965     RX_MATCH_COPY_FREE(rx);
15966 #ifdef PERL_ANY_COW
15967     SvREFCNT_dec(r->saved_copy);
15968 #endif
15969     Safefree(r->offs);
15970     SvREFCNT_dec(r->qr_anoncv);
15971     rx->sv_u.svu_rx = 0;
15972 }
15973
15974 /*  reg_temp_copy()
15975
15976     This is a hacky workaround to the structural issue of match results
15977     being stored in the regexp structure which is in turn stored in
15978     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
15979     could be PL_curpm in multiple contexts, and could require multiple
15980     result sets being associated with the pattern simultaneously, such
15981     as when doing a recursive match with (??{$qr})
15982
15983     The solution is to make a lightweight copy of the regexp structure
15984     when a qr// is returned from the code executed by (??{$qr}) this
15985     lightweight copy doesn't actually own any of its data except for
15986     the starp/end and the actual regexp structure itself.
15987
15988 */
15989
15990
15991 REGEXP *
15992 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
15993 {
15994     struct regexp *ret;
15995     struct regexp *const r = ReANY(rx);
15996     const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
15997
15998     PERL_ARGS_ASSERT_REG_TEMP_COPY;
15999
16000     if (!ret_x)
16001         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
16002     else {
16003         SvOK_off((SV *)ret_x);
16004         if (islv) {
16005             /* For PVLVs, SvANY points to the xpvlv body while sv_u points
16006                to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
16007                made both spots point to the same regexp body.) */
16008             REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
16009             assert(!SvPVX(ret_x));
16010             ret_x->sv_u.svu_rx = temp->sv_any;
16011             temp->sv_any = NULL;
16012             SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
16013             SvREFCNT_dec_NN(temp);
16014             /* SvCUR still resides in the xpvlv struct, so the regexp copy-
16015                ing below will not set it. */
16016             SvCUR_set(ret_x, SvCUR(rx));
16017         }
16018     }
16019     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
16020        sv_force_normal(sv) is called.  */
16021     SvFAKE_on(ret_x);
16022     ret = ReANY(ret_x);
16023
16024     SvFLAGS(ret_x) |= SvUTF8(rx);
16025     /* We share the same string buffer as the original regexp, on which we
16026        hold a reference count, incremented when mother_re is set below.
16027        The string pointer is copied here, being part of the regexp struct.
16028      */
16029     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
16030            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
16031     if (r->offs) {
16032         const I32 npar = r->nparens+1;
16033         Newx(ret->offs, npar, regexp_paren_pair);
16034         Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16035     }
16036     if (r->substrs) {
16037         Newx(ret->substrs, 1, struct reg_substr_data);
16038         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16039
16040         SvREFCNT_inc_void(ret->anchored_substr);
16041         SvREFCNT_inc_void(ret->anchored_utf8);
16042         SvREFCNT_inc_void(ret->float_substr);
16043         SvREFCNT_inc_void(ret->float_utf8);
16044
16045         /* check_substr and check_utf8, if non-NULL, point to either their
16046            anchored or float namesakes, and don't hold a second reference.  */
16047     }
16048     RX_MATCH_COPIED_off(ret_x);
16049 #ifdef PERL_ANY_COW
16050     ret->saved_copy = NULL;
16051 #endif
16052     ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
16053     SvREFCNT_inc_void(ret->qr_anoncv);
16054
16055     return ret_x;
16056 }
16057 #endif
16058
16059 /* regfree_internal()
16060
16061    Free the private data in a regexp. This is overloadable by
16062    extensions. Perl takes care of the regexp structure in pregfree(),
16063    this covers the *pprivate pointer which technically perl doesn't
16064    know about, however of course we have to handle the
16065    regexp_internal structure when no extension is in use.
16066
16067    Note this is called before freeing anything in the regexp
16068    structure.
16069  */
16070
16071 void
16072 Perl_regfree_internal(pTHX_ REGEXP * const rx)
16073 {
16074     dVAR;
16075     struct regexp *const r = ReANY(rx);
16076     RXi_GET_DECL(r,ri);
16077     GET_RE_DEBUG_FLAGS_DECL;
16078
16079     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
16080
16081     DEBUG_COMPILE_r({
16082         if (!PL_colorset)
16083             reginitcolors();
16084         {
16085             SV *dsv= sv_newmortal();
16086             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
16087                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
16088             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
16089                 PL_colors[4],PL_colors[5],s);
16090         }
16091     });
16092 #ifdef RE_TRACK_PATTERN_OFFSETS
16093     if (ri->u.offsets)
16094         Safefree(ri->u.offsets);             /* 20010421 MJD */
16095 #endif
16096     if (ri->code_blocks) {
16097         int n;
16098         for (n = 0; n < ri->num_code_blocks; n++)
16099             SvREFCNT_dec(ri->code_blocks[n].src_regex);
16100         Safefree(ri->code_blocks);
16101     }
16102
16103     if (ri->data) {
16104         int n = ri->data->count;
16105
16106         while (--n >= 0) {
16107           /* If you add a ->what type here, update the comment in regcomp.h */
16108             switch (ri->data->what[n]) {
16109             case 'a':
16110             case 'r':
16111             case 's':
16112             case 'S':
16113             case 'u':
16114                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
16115                 break;
16116             case 'f':
16117                 Safefree(ri->data->data[n]);
16118                 break;
16119             case 'l':
16120             case 'L':
16121                 break;
16122             case 'T':
16123                 { /* Aho Corasick add-on structure for a trie node.
16124                      Used in stclass optimization only */
16125                     U32 refcount;
16126                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
16127                     OP_REFCNT_LOCK;
16128                     refcount = --aho->refcount;
16129                     OP_REFCNT_UNLOCK;
16130                     if ( !refcount ) {
16131                         PerlMemShared_free(aho->states);
16132                         PerlMemShared_free(aho->fail);
16133                          /* do this last!!!! */
16134                         PerlMemShared_free(ri->data->data[n]);
16135                         PerlMemShared_free(ri->regstclass);
16136                     }
16137                 }
16138                 break;
16139             case 't':
16140                 {
16141                     /* trie structure. */
16142                     U32 refcount;
16143                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
16144                     OP_REFCNT_LOCK;
16145                     refcount = --trie->refcount;
16146                     OP_REFCNT_UNLOCK;
16147                     if ( !refcount ) {
16148                         PerlMemShared_free(trie->charmap);
16149                         PerlMemShared_free(trie->states);
16150                         PerlMemShared_free(trie->trans);
16151                         if (trie->bitmap)
16152                             PerlMemShared_free(trie->bitmap);
16153                         if (trie->jump)
16154                             PerlMemShared_free(trie->jump);
16155                         PerlMemShared_free(trie->wordinfo);
16156                         /* do this last!!!! */
16157                         PerlMemShared_free(ri->data->data[n]);
16158                     }
16159                 }
16160                 break;
16161             default:
16162                 Perl_croak(aTHX_ "panic: regfree data code '%c'",
16163                                                     ri->data->what[n]);
16164             }
16165         }
16166         Safefree(ri->data->what);
16167         Safefree(ri->data);
16168     }
16169
16170     Safefree(ri);
16171 }
16172
/* Convenience wrappers.  av_dup_inc/hv_dup_inc pass their first argument to
 * sv_dup_inc() and cast the result to AV* / HV*; SAVEPVN is a NULL-tolerant
 * savepvn().  Every argument is parenthesised so that a compound expression
 * (e.g. a ?: or pointer arithmetic) is cast and passed as a whole. */
#define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)(s),(t)))
#define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)(s),(t)))
#define SAVEPVN(p,n)    ((p) ? savepvn((p),(n)) : NULL)
16176
/*
   re_dup - duplicate a regexp.

   This routine is expected to clone a given regexp structure. It is only
   compiled under USE_ITHREADS.

   After all of the core data stored in struct regexp is duplicated
   the regexp_engine.dupe method is used to copy any private data
   stored in the *pprivate pointer. This allows extensions to handle
   any duplication they need to do.

   See pregfree() and regfree_internal() if you change anything here.
*/
16190 #if defined(USE_ITHREADS)
16191 #ifndef PERL_IN_XSUB_RE
/* Give the struct regexp behind 'dstr' its own copies of the pointer-valued
 * members it currently holds, using the struct regexp behind 'sstr' as the
 * source.
 *
 *   sstr   the regexp being cloned (read only)
 *   dstr   the clone being filled in
 *   param  clone parameters, handed on to sv_dup_inc() and to the engine's
 *          private-data dup hook (CALLREGDUPE_PVT)
 *
 * NOTE(review): several members of 'ret' (substrs, pprivate, subbeg, the
 * paren-names hash, qr_anoncv) are read before this function assigns them,
 * so it appears to rely on the caller having already copied *r into *ret --
 * confirm against the caller.
 */
void
Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
{
    dVAR;
    I32 npar;
    const struct regexp *r = ReANY(sstr);
    struct regexp *ret = ReANY(dstr);

    PERL_ARGS_ASSERT_RE_DUP_GUTS;

    /* Fresh capture-offset array: one pair per paren plus one extra entry. */
    npar = r->nparens+1;
    Newx(ret->offs, npar, regexp_paren_pair);
    Copy(r->offs, ret->offs, npar, regexp_paren_pair);

    if (ret->substrs) {
        /* Do it this way to avoid reading from *r after the StructCopy().
           That way, if any of the sv_dup_inc()s dislodge *r from the L1
           cache, it doesn't matter.  */
        /* Remember, before anything is duplicated, whether the "check"
           substring aliases the anchored one (otherwise the float one). */
        const bool anchored = r->check_substr
            ? r->check_substr == r->anchored_substr
            : r->check_utf8 == r->anchored_utf8;
        Newx(ret->substrs, 1, struct reg_substr_data);
        StructCopy(r->substrs, ret->substrs, struct reg_substr_data);

        /* The copied struct still holds the source's SV pointers; replace
           each with its duplicate. */
        ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
        ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
        ret->float_substr = sv_dup_inc(ret->float_substr, param);
        ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);

        /* check_substr and check_utf8, if non-NULL, point to either their
           anchored or float namesakes, and don't hold a second reference.  */

        /* So they are re-pointed at the duplicates made above rather than
           being passed through sv_dup_inc() themselves. */
        if (ret->check_substr) {
            if (anchored) {
                assert(r->check_utf8 == r->anchored_utf8);
                ret->check_substr = ret->anchored_substr;
                ret->check_utf8 = ret->anchored_utf8;
            } else {
                assert(r->check_substr == r->float_substr);
                assert(r->check_utf8 == r->float_utf8);
                ret->check_substr = ret->float_substr;
                ret->check_utf8 = ret->float_utf8;
            }
        } else if (ret->check_utf8) {
            /* Only the UTF-8 variant of the check substring is set. */
            if (anchored) {
                ret->check_utf8 = ret->anchored_utf8;
            } else {
                ret->check_utf8 = ret->float_utf8;
            }
        }
    }

    /* Duplicate the named-capture hash and the qr// anonymous CV. */
    RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
    ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));

    /* Let the regex engine duplicate whatever hangs off pprivate. */
    if (ret->pprivate)
        RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));

    /* Copy the saved match buffer only when RX_MATCH_COPIED is set for the
       clone; otherwise the clone gets no buffer at all. */
    if (RX_MATCH_COPIED(dstr))
        ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
    else
        ret->subbeg = NULL;
#ifdef PERL_ANY_COW
    ret->saved_copy = NULL;
#endif

    /* Whether mother_re be set or no, we need to copy the string.  We
       cannot refrain from copying it when the storage points directly to
       our mother regexp, because that's
               1: a buffer in a different thread
               2: something we no longer hold a reference on
               so we need to copy it locally.  */
    /* SvCUR(sstr)+1 bytes are copied, i.e. one byte beyond the string's
       current length. */
    RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
    ret->mother_re   = NULL;
}
16267 #endif /* PERL_IN_XSUB_RE */
16268
16269 /*
16270    regdupe_internal()
16271
16272    This is the internal complement to regdupe() which is used to copy
16273    the structure pointed to by the *pprivate pointer in the regexp.
16274    This is the core version of the extension overridable cloning hook.
   The regexp structure being duplicated will be copied by perl prior
   to this and will be provided as the REGEXP *rx argument, however
   with the /old/ structure's pprivate pointer value. Thus this routine
16278    may override any copying normally done by perl.
16279
16280    It returns a pointer to the new regexp_internal structure.
16281 */
16282
/* Build and return a new regexp_internal that duplicates the one attached to
 * 'rx'.  'param' is passed to sv_dup_inc() for every SV that is duplicated.
 * The result is returned as 'void *'; this function does not store it into
 * 'rx'. */
void *
Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
{
    dVAR;
    struct regexp *const r = ReANY(rx);
    regexp_internal *reti;
    int len;
    RXi_GET_DECL(r,ri);

    PERL_ARGS_ASSERT_REGDUPE_INTERNAL;

    len = ProgLen(ri);

    /* Allocate the struct plus room for 'len' further regnodes, then copy
     * len+1 regnodes of program.  NOTE(review): presumably the struct itself
     * already contains the first program node -- confirm in regcomp.h. */
    Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
          char, regexp_internal);
    Copy(ri->program, reti->program, len+1, regnode);

    /* Code blocks: copy the array wholesale, then give each entry its own
     * duplicated src_regex. */
    reti->num_code_blocks = ri->num_code_blocks;
    if (ri->code_blocks) {
        int n;
        Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
                struct reg_code_block);
        Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
                struct reg_code_block);
        for (n = 0; n < ri->num_code_blocks; n++)
             reti->code_blocks[n].src_regex = (REGEXP*)
                    sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
    }
    else
        reti->code_blocks = NULL;

    /* Stays NULL unless an 'f' or 'T' data item below supplies one. */
    reti->regstclass = NULL;

    if (ri->data) {
        struct reg_data *d;
        const int count = ri->data->count;
        int i;

        /* reg_data followed by 'count' pointer slots, plus a parallel array
         * of one-byte type codes. */
        Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
                char, struct reg_data);
        Newx(d->what, count, U8);

        d->count = count;
        for (i = 0; i < count; i++) {
            d->what[i] = ri->data->what[i];
            switch (d->what[i]) {
                /* see also regcomp.h and regfree_internal() */
            case 'a': /* actually an AV, but the dup function is identical.  */
            case 'r':
            case 's':
            case 'S':
            case 'u': /* actually an HV, but the dup function is identical.  */
                d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
                break;
            case 'f':
                /* This is cheating. */
                /* The item is copied as a regnode_ssc and the copy doubles
                 * as the clone's start class. */
                Newx(d->data[i], 1, regnode_ssc);
                StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
                reti->regstclass = (regnode*)d->data[i];
                break;
            case 'T':
                /* Trie stclasses are readonly and can thus be shared
                 * without duplication. We free the stclass in pregfree
                 * when the corresponding reg_ac_data struct is freed.
                 */
                reti->regstclass= ri->regstclass;
                /* Fall through */
            case 't':
                /* Shared, reference-counted data: bump the count under the
                 * op refcount lock.  NOTE(review): the 'T' item reaches this
                 * cast too, so this presumably relies on its struct having
                 * 'refcount' at the same offset as reg_trie_data -- confirm
                 * in regcomp.h. */
                OP_REFCNT_LOCK;
                ((reg_trie_data*)ri->data->data[i])->refcount++;
                OP_REFCNT_UNLOCK;
                /* Fall through */
            case 'l':
            case 'L':
                /* The pointer itself is shared with the original. */
                d->data[i] = ri->data->data[i];
                break;
            default:
                Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
                                                           ri->data->what[i]);
            }
        }

        reti->data = d;
    }
    else
        reti->data = NULL;

    reti->name_list_idx = ri->name_list_idx;

#ifdef RE_TRACK_PATTERN_OFFSETS
    /* Copy the 2*len+1 entry offsets table.  Nothing is assigned to
     * reti->u.offsets when the source has none. */
    if (ri->u.offsets) {
        Newx(reti->u.offsets, 2*len+1, U32);
        Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
    }
#else
    SetProgLen(reti,len);
#endif

    return (void*)reti;
}
16383
16384 #endif    /* USE_ITHREADS */
16385
16386 #ifndef PERL_IN_XSUB_RE
16387
16388 /*
16389  - regnext - dig the "next" pointer out of a node
16390  */
16391 regnode *
16392 Perl_regnext(pTHX_ regnode *p)
16393 {
16394     dVAR;
16395     I32 offset;
16396
16397     if (!p)
16398         return(NULL);
16399
16400     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
16401         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
16402                                                 (int)OP(p), (int)REGNODE_MAX);
16403     }
16404
16405     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
16406     if (offset == 0)
16407         return(NULL);
16408
16409     return(p+offset);
16410 }
16411 #endif
16412
16413 STATIC void
16414 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
16415 {
16416     va_list args;
16417     STRLEN l1 = strlen(pat1);
16418     STRLEN l2 = strlen(pat2);
16419     char buf[512];
16420     SV *msv;
16421     const char *message;
16422
16423     PERL_ARGS_ASSERT_RE_CROAK2;
16424
16425     if (l1 > 510)
16426         l1 = 510;
16427     if (l1 + l2 > 510)
16428         l2 = 510 - l1;
16429     Copy(pat1, buf, l1 , char);
16430     Copy(pat2, buf + l1, l2 , char);
16431     buf[l1 + l2] = '\n';
16432     buf[l1 + l2 + 1] = '\0';
16433     va_start(args, pat2);
16434     msv = vmess(buf, &args);
16435     va_end(args);
16436     message = SvPV_const(msv,l1);
16437     if (l1 > 512)
16438         l1 = 512;
16439     Copy(message, buf, l1 , char);
16440     /* l1-1 to avoid \n */
16441     Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
16442 }
16443
16444 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
16445
16446 #ifndef PERL_IN_XSUB_RE
16447 void
16448 Perl_save_re_context(pTHX)
16449 {
16450     dVAR;
16451
16452     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
16453     if (PL_curpm) {
16454         const REGEXP * const rx = PM_GETRE(PL_curpm);
16455         if (rx) {
16456             U32 i;
16457             for (i = 1; i <= RX_NPARENS(rx); i++) {
16458                 char digits[TYPE_CHARS(long)];
16459                 const STRLEN len = my_snprintf(digits, sizeof(digits),
16460                                                "%lu", (long)i);
16461                 GV *const *const gvp
16462                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
16463
16464                 if (gvp) {
16465                     GV * const gv = *gvp;
16466                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
16467                         save_scalar(gv);
16468                 }
16469             }
16470         }
16471     }
16472 }
16473 #endif
16474
16475 #ifdef DEBUGGING
16476
16477 STATIC void
16478 S_put_byte(pTHX_ SV *sv, int c)
16479 {
16480     PERL_ARGS_ASSERT_PUT_BYTE;
16481
16482     if (!isPRINT(c)) {
16483         switch (c) {
16484             case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
16485             case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
16486             case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
16487             case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
16488             case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
16489
16490             default:
16491                 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
16492                 break;
16493         }
16494     }
16495     else {
16496         const char string = c;
16497         if (c == '-' || c == ']' || c == '\\' || c == '^')
16498             sv_catpvs(sv, "\\");
16499         sv_catpvn(sv, &string, 1);
16500     }
16501 }
16502
16503 STATIC void
16504 S_put_range(pTHX_ SV *sv, UV start, UV end)
16505 {
16506
16507     /* Appends to 'sv' a displayable version of the range of code points from
16508      * 'start' to 'end' */
16509
16510     assert(start <= end);
16511
16512     PERL_ARGS_ASSERT_PUT_RANGE;
16513
16514     if (end - start < 3) {  /* Individual chars in short ranges */
16515         for (; start <= end; start++)
16516             put_byte(sv, start);
16517     }
16518     else if (   end > 255
16519              || ! isALPHANUMERIC(start)
16520              || ! isALPHANUMERIC(end)
16521              || isDIGIT(start) != isDIGIT(end)
16522              || isUPPER(start) != isUPPER(end)
16523              || isLOWER(start) != isLOWER(end)
16524
16525                 /* This final test should get optimized out except on EBCDIC
16526                  * platforms, where it causes ranges that cross discontinuities
16527                  * like i/j to be shown as hex instead of the misleading,
16528                  * e.g. H-K (since that range includes more than H, I, J, K).
16529                  * */
16530              || (end - start) != NATIVE_TO_ASCII(end) - NATIVE_TO_ASCII(start))
16531     {
16532         Perl_sv_catpvf(aTHX_ sv, "\\x{%02" UVXf "}-\\x{%02" UVXf "}",
16533                        start,
16534                        (end < 256) ? end : 255);
16535     }
16536     else { /* Here, the ends of the range are both digits, or both uppercase,
16537               or both lowercase; and there's no discontinuity in the range
16538               (which could happen on EBCDIC platforms) */
16539         put_byte(sv, start);
16540         sv_catpvs(sv, "-");
16541         put_byte(sv, end);
16542     }
16543 }
16544
16545 STATIC bool
16546 S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
16547 {
16548     /* Appends to 'sv' a displayable version of the innards of the bracketed
16549      * character class whose bitmap is 'bitmap';  Returns 'TRUE' if it actually
16550      * output anything */
16551
16552     int i;
16553     bool has_output_anything = FALSE;
16554
16555     PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
16556
16557     for (i = 0; i < 256; i++) {
16558         if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) {
16559
16560             /* The character at index i should be output.  Find the next
16561              * character that should NOT be output */
16562             int j;
16563             for (j = i + 1; j <= 256; j++) {
16564                 if (! BITMAP_TEST((U8 *) bitmap, j)) {
16565                     break;
16566                 }
16567             }
16568
16569             /* Everything between them is a single range that should be output
16570              * */
16571             put_range(sv, i, j - 1);
16572             has_output_anything = TRUE;
16573             i = j;
16574         }
16575     }
16576
16577     return has_output_anything;
16578 }
16579
/* Helper macros for S_dumpuntil(); both refer to that function's locals by
 * name (optstart, node, r, start, last, sv, indent, depth). */

/* If a run of OPTIMIZED nodes is being tracked (optstart is set), print how
 * many nodes it spanned (under DEBUG_OPTIMISE_r) and stop tracking it.
 * Expands to an 'if' statement, so it can follow a bare 'else'. */
#define CLEAR_OPTSTART \
    if (optstart) STMT_START {                                               \
        DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,                       \
                              " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
        optstart=NULL;                                                       \
    } STMT_END

/* Close any pending OPTIMIZED run, then recursively dump the nodes from (b)
 * up to (e) one level deeper, and continue from wherever that dump stopped.
 * Expands to TWO statements (and supplies its own trailing ';'), so it must
 * only be used inside a braced block. */
#define DUMPUNTIL(b,e)                                                       \
                    CLEAR_OPTSTART;                                          \
                    node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
16590
/* Print the regnodes of a compiled program to Perl_debug_log, one per line,
 * recursing (via DUMPUNTIL) into the bodies of branches, tries and
 * quantifiers with increased indentation.
 *
 *   r       the regexp whose program is being dumped
 *   start   base address; printed node positions are relative to it
 *   node    first node to print
 *   last    stop once 'node' reaches this address (NULL: no such limit)
 *   plast   a second limit; it replaces 'last' when it is non-NULL and
 *           lower than 'last'
 *   sv      scratch SV that receives each node's description from regprop()
 *   indent  current indentation level (two columns per level)
 *   depth   recursion depth; only incremented and passed along by the code
 *           visible here
 *
 * Returns the node at which dumping stopped.
 */
STATIC const regnode *
S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
            const regnode *last, const regnode *plast,
            SV* sv, I32 indent, U32 depth)
{
    dVAR;
    U8 op = PSEUDO;     /* Arbitrary non-END op. */
    const regnode *next;
    const regnode *optstart= NULL;  /* first node of a run of OPTIMIZED nodes */

    RXi_GET_DECL(r,ri);
    GET_RE_DEBUG_FLAGS_DECL;

    PERL_ARGS_ASSERT_DUMPUNTIL;

#ifdef DEBUG_DUMPUNTIL
    PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
        last ? last-start : 0,plast ? plast-start : 0);
#endif

    /* Use the tighter of the two limits. */
    if (plast && plast < last)
        last= plast;

    while (PL_regkind[op] != END && (!last || node < last)) {
        /* While that wasn't END last time... */
        NODE_ALIGN(node);
        op = OP(node);
        /* Closing constructs are shown one level back out. */
        if (op == CLOSE || op == WHILEM)
            indent--;
        next = regnext((regnode *)node);

        /* Where, what. */
        /* Only the first node of a run of OPTIMIZED nodes is printed, and
         * only when optimise debugging is enabled; the others are skipped. */
        if (OP(node) == OPTIMIZED) {
            if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
                optstart = node;
            else
                goto after_print;
        } else
            CLEAR_OPTSTART;

        regprop(r, sv, node, NULL);
        PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
                      (int)(2*indent + 1), "", SvPVX_const(sv));

        if (OP(node) != OPTIMIZED) {
            if (next == NULL)           /* Next ptr. */
                PerlIO_printf(Perl_debug_log, " (0)");
            else if (PL_regkind[(U8)op] == BRANCH
                     && PL_regkind[OP(next)] != BRANCH )
                PerlIO_printf(Perl_debug_log, " (FAIL)");
            else
                PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
            (void)PerlIO_putc(Perl_debug_log, '\n');
        }

      after_print:
        /* Now step past this node, first dumping any nested body.  Every
         * DUMPUNTIL() below also assigns the resume position to 'node'. */
        if (PL_regkind[(U8)op] == BRANCHJ) {
            assert(next);
            {
                /* The body ends at the target of a following LONGJMP, if
                 * there is one, clipped to 'last'. */
                const regnode *nnode = (OP(next) == LONGJMP
                                       ? regnext((regnode *)next)
                                       : next);
                if (last && nnode > last)
                    nnode = last;
                DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
            }
        }
        else if (PL_regkind[(U8)op] == BRANCH) {
            assert(next);
            DUMPUNTIL(NEXTOPER(node), next);
        }
        else if ( PL_regkind[(U8)op]  == TRIE ) {
            const regnode *this_trie = node;
            /* NB: this 'op' shadows the function-level one in this block. */
            const char op = OP(node);
            const U32 n = ARG(node);
            /* Opcodes from AHOCORASICK upwards keep a reg_ac_data in slot n,
             * which in turn names the slot of the trie itself; lower opcodes
             * keep the trie directly in slot n. */
            const reg_ac_data * const ac = op>=AHOCORASICK ?
               (reg_ac_data *)ri->data->data[n] :
               NULL;
            const reg_trie_data * const trie =
                (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
#ifdef DEBUGGING
            AV *const trie_words
                           = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
#endif
            const regnode *nextbranch= NULL;
            I32 word_idx;
            sv_setpvs(sv, "");
            /* One line per word accepted by the trie. */
            for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
                SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);

                PerlIO_printf(Perl_debug_log, "%*s%s ",
                   (int)(2*(indent+3)), "",
                    elem_ptr
                    ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
                                SvCUR(*elem_ptr), 60,
                                PL_colors[0], PL_colors[1],
                                (SvUTF8(*elem_ptr)
                                 ? PERL_PV_ESCAPE_UNI
                                 : 0)
                                | PERL_PV_PRETTY_ELLIPSES
                                | PERL_PV_PRETTY_LTGT
                            )
                    : "???"
                );
                if (trie->jump) {
                    /* jump[word_idx+1] is this word's offset from the trie
                     * node to its continuation (0: none, shown as 'next');
                     * jump[0] gives the offset used for the first
                     * 'nextbranch'. */
                    U16 dist= trie->jump[word_idx+1];
                    PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
                               (UV)((dist ? this_trie + dist : next) - start));
                    if (dist) {
                        if (!nextbranch)
                            nextbranch= this_trie + trie->jump[0];
                        DUMPUNTIL(this_trie + dist, nextbranch);
                    }
                    if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
                        nextbranch= regnext((regnode *)nextbranch);
                } else {
                    PerlIO_printf(Perl_debug_log, "\n");
                }
            }
            /* Resume after the trie, but never beyond 'last'. */
            if (last && next > last)
                node= last;
            else
                node= next;
        }
        else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
            DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
                    NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
        }
        else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
            assert(next);
            DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
        }
        else if ( op == PLUS || op == STAR) {
            DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
        }
        else if (PL_regkind[(U8)op] == ANYOF) {
            /* arglen 1 + class block */
            node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL)
                          ? ANYOF_POSIXL_SKIP
                          : ANYOF_SKIP);
            node = NEXTOPER(node);
        }
        else if (PL_regkind[(U8)op] == EXACT) {
            /* Literal string, where present. */
            node += NODE_SZ_STR(node) - 1;
            node = NEXTOPER(node);
        }
        else {
            /* Any other node: skip the node itself plus its arguments. */
            node = NEXTOPER(node);
            node += regarglen[(U8)op];
        }
        /* Whatever follows an opening construct is shown one level deeper. */
        if (op == CURLYX || op == OPEN)
            indent++;
    }
    CLEAR_OPTSTART;
#ifdef DEBUG_DUMPUNTIL
    PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
#endif
    return node;
}
16751
16752 #endif  /* DEBUGGING */
16753
16754 /*
16755  * Local variables:
16756  * c-indentation-style: bsd
16757  * c-basic-offset: 4
16758  * indent-tabs-mode: nil
16759  * End:
16760  *
16761  * ex: set ts=8 sts=4 sw=4 et:
16762  */