regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 extern const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_static.c"
  90 #include "charclass_invlists.h"
  91 #include "inline_invlist.c"
  92 #include "unicode_constants.h"
  93
  94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  95 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  96 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  97
  98 #ifdef op
  99 #undef op
 100 #endif /* op */
 101
 102 #ifdef MSDOS
 103 #  if defined(BUGGY_MSC6)
 104  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
 105 #    pragma optimize("a",off)
 106  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
 107 #    pragma optimize("w",on )
 108 #  endif /* BUGGY_MSC6 */
 109 #endif /* MSDOS */
 110
 111 #ifndef STATIC
 112 #define STATIC  static
 113 #endif
 114
 115
 116 typedef struct RExC_state_t {
 117     U32         flags;                  /* RXf_* are we folding, multilining? */
 118     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 119     char        *precomp;               /* uncompiled string. */
 120     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 121     regexp      *rx;                    /* perl core regexp structure */
 122     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 123     char        *start;                 /* Start of input for compile */
 124     char        *end;                   /* End of input for compile */
 125     char        *parse;                 /* Input-scan pointer. */
 126     SSize_t     whilem_seen;            /* number of WHILEM in this expr */
 127     regnode     *emit_start;            /* Start of emitted-code area */
 128     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 129     regnode     *emit;                  /* Code-emit pointer; if = &emit_dummy,
 130                                            implies compiling, so don't emit */
 131     regnode     emit_dummy;             /* placeholder for emit to point to */
 132     I32         naughty;                /* How bad is this pattern? */
 133     I32         sawback;                /* Did we see \1, ...? */
 134     U32         seen;
 135     SSize_t     size;                   /* Code size. */
 136     I32         npar;                   /* Capture buffer count, (OPEN). */
 137     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 138     I32         nestroot;               /* root parens we are in - used by accept */
 139     I32         extralen;
 140     I32         seen_zerolen;
 141     regnode     **open_parens;          /* pointers to open parens */
 142     regnode     **close_parens;         /* pointers to close parens */
 143     regnode     *opend;                 /* END node in program */
 144     I32         utf8;           /* whether the pattern is utf8 or not */
 145     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 146                                 /* XXX use this for future optimisation of case
 147                                  * where pattern must be upgraded to utf8. */
 148     I32         uni_semantics;  /* If a d charset modifier should use unicode
 149                                    rules, even if the pattern is not in
 150                                    utf8 */
 151     HV          *paren_names;           /* Paren names */
 152
 153     regnode     **recurse;              /* Recurse regops */
 154     I32         recurse_count;          /* Number of recurse regops */
 155     I32         in_lookbehind;
 156     I32         contains_locale;
 157     I32         override_recoding;
 158     I32         in_multi_char_class;
 159     struct reg_code_block *code_blocks; /* positions of literal (?{})
 160                                             within pattern */
 161     int         num_code_blocks;        /* size of code_blocks[] */
 162     int         code_index;             /* next code_blocks[] slot */
 163 #if ADD_TO_REGEXEC
 164     char        *starttry;              /* -Dr: where regtry was called. */
 165 #define RExC_starttry   (pRExC_state->starttry)
 166 #endif
 167     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 168 #ifdef DEBUGGING
 169     const char  *lastparse;
 170     I32         lastnum;
 171     AV          *paren_name_list;       /* idx -> name */
 172 #define RExC_lastparse  (pRExC_state->lastparse)
 173 #define RExC_lastnum    (pRExC_state->lastnum)
 174 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 175 #endif
 176 } RExC_state_t;
 177
 178 #define RExC_flags      (pRExC_state->flags)
 179 #define RExC_pm_flags   (pRExC_state->pm_flags)
 180 #define RExC_precomp    (pRExC_state->precomp)
 181 #define RExC_rx_sv      (pRExC_state->rx_sv)
 182 #define RExC_rx         (pRExC_state->rx)
 183 #define RExC_rxi        (pRExC_state->rxi)
 184 #define RExC_start      (pRExC_state->start)
 185 #define RExC_end        (pRExC_state->end)
 186 #define RExC_parse      (pRExC_state->parse)
 187 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 188 #ifdef RE_TRACK_PATTERN_OFFSETS
 189 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 190 #endif
 191 #define RExC_emit       (pRExC_state->emit)
 192 #define RExC_emit_dummy (pRExC_state->emit_dummy)
 193 #define RExC_emit_start (pRExC_state->emit_start)
 194 #define RExC_emit_bound (pRExC_state->emit_bound)
 195 #define RExC_naughty    (pRExC_state->naughty)
 196 #define RExC_sawback    (pRExC_state->sawback)
 197 #define RExC_seen       (pRExC_state->seen)
 198 #define RExC_size       (pRExC_state->size)
 199 #define RExC_npar       (pRExC_state->npar)
 200 #define RExC_nestroot   (pRExC_state->nestroot)
 201 #define RExC_extralen   (pRExC_state->extralen)
 202 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 203 #define RExC_utf8       (pRExC_state->utf8)
 204 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 205 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 206 #define RExC_open_parens        (pRExC_state->open_parens)
 207 #define RExC_close_parens       (pRExC_state->close_parens)
 208 #define RExC_opend      (pRExC_state->opend)
 209 #define RExC_paren_names        (pRExC_state->paren_names)
 210 #define RExC_recurse    (pRExC_state->recurse)
 211 #define RExC_recurse_count      (pRExC_state->recurse_count)
 212 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 213 #define RExC_contains_locale    (pRExC_state->contains_locale)
 214 #define RExC_override_recoding (pRExC_state->override_recoding)
 215 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 216
 217
 218 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 219 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 220         ((*s) == '{' && regcurly(s, FALSE)))
 221
 222 #ifdef SPSTART
 223 #undef SPSTART          /* dratted cpp namespace... */
 224 #endif
 225 /*
 226  * Flags to be passed up and down.
 227  */
 228 #define WORST           0       /* Worst case. */
 229 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 230
 231 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 232  * character.  (There needs to be a case: in the switch statement in regexec.c
 233  * for any node marked SIMPLE.)  Note that this is not the same thing as
 234  * REGNODE_SIMPLE */
 235 #define SIMPLE          0x02
 236 #define SPSTART         0x04    /* Starts with * or + */
 237 #define POSTPONED       0x08    /* (?1),(?&name), (??{...}) or similar */
 238 #define TRYAGAIN        0x10    /* Weeded out a declaration. */
 239 #define RESTART_UTF8    0x20    /* Restart, need to calcuate sizes as UTF-8 */
 240
 241 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 242
 243 /* whether trie related optimizations are enabled */
 244 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 245 #define TRIE_STUDY_OPT
 246 #define FULL_TRIE_STUDY
 247 #define TRIE_STCLASS
 248 #endif
 249
 250
 251
 252 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 253 #define PBITVAL(paren) (1 << ((paren) & 7))
 254 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 255 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 256 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 257
 258 #define REQUIRE_UTF8    STMT_START {                                       \
 259                                      if (!UTF) {                           \
 260                                          *flagp = RESTART_UTF8;            \
 261                                          return NULL;                      \
 262                                      }                                     \
 263                         } STMT_END
 264
 265 /* This converts the named class defined in regcomp.h to its equivalent class
 266  * number defined in handy.h. */
 267 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 268 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 269
 270 /* About scan_data_t.
 271
 272   During optimisation we recurse through the regexp program performing
 273   various inplace (keyhole style) optimisations. In addition study_chunk
 274   and scan_commit populate this data structure with information about
 275   what strings MUST appear in the pattern. We look for the longest
 276   string that must appear at a fixed location, and we look for the
 277   longest string that may appear at a floating location. So for instance
 278   in the pattern:
 279
 280     /FOO[xX]A.*B[xX]BAR/
 281
 282   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 283   strings (because they follow a .* construct). study_chunk will identify
 284   both FOO and BAR as being the longest fixed and floating strings respectively.
 285
 286   The strings can be composites, for instance
 287
 288      /(f)(o)(o)/
 289
 290   will result in a composite fixed substring 'foo'.
 291
 292   For each string some basic information is maintained:
 293
 294   - offset or min_offset
 295     This is the position the string must appear at, or not before.
 296     It also implicitly (when combined with minlenp) tells us how many
 297     characters must match before the string we are searching for.
 298     Likewise when combined with minlenp and the length of the string it
 299     tells us how many characters must appear after the string we have
 300     found.
 301
 302   - max_offset
 303     Only used for floating strings. This is the rightmost point that
 304     the string can appear at. If set to SSize_t_MAX it indicates that the
 305     string can occur infinitely far to the right.
 306
 307   - minlenp
 308     A pointer to the minimum number of characters of the pattern that the
 309     string was found inside. This is important as in the case of positive
 310     lookahead or positive lookbehind we can have multiple patterns
 311     involved. Consider
 312
 313     /(?=FOO).*F/
 314
 315     The minimum length of the pattern overall is 3, the minimum length
 316     of the lookahead part is 3, but the minimum length of the part that
 317     will actually match is 1. So 'FOO's minimum length is 3, but the
 318     minimum length for the F is 1. This is important as the minimum length
 319     is used to determine offsets in front of and behind the string being
 320     looked for.  Since strings can be composites this is the length of the
 321     pattern at the time it was committed with a scan_commit. Note that
 322     the length is calculated by study_chunk, so that the minimum lengths
 323     are not known until the full pattern has been compiled, thus the
 324     pointer to the value.
 325
 326   - lookbehind
 327
 328     In the case of lookbehind the string being searched for can be
 329     offset past the start point of the final matching string.
 330     If this value was just blithely removed from the min_offset it would
 331     invalidate some of the calculations for how many chars must match
 332     before or after (as they are derived from min_offset and minlen and
 333     the length of the string being searched for).
 334     When the final pattern is compiled and the data is moved from the
 335     scan_data_t structure into the regexp structure the information
 336     about lookbehind is factored in, with the information that would
 337     have been lost precalculated in the end_shift field for the
 338     associated string.
 339
 340   The fields pos_min and pos_delta are used to store the minimum offset
 341   and the delta to the maximum offset at the current point in the pattern.
 342
 343 */
 344
 345 typedef struct scan_data_t {
 346     /*I32 len_min;      unused */
 347     /*I32 len_delta;    unused */
 348     SSize_t pos_min;
 349     SSize_t pos_delta;
 350     SV *last_found;
 351     SSize_t last_end;       /* min value, <0 unless valid. */
 352     SSize_t last_start_min;
 353     SSize_t last_start_max;
 354     SV **longest;           /* Either &l_fixed, or &l_float. */
 355     SV *longest_fixed;      /* longest fixed string found in pattern */
 356     SSize_t offset_fixed;   /* offset where it starts */
 357     SSize_t *minlen_fixed;  /* pointer to the minlen relevant to the string */
 358     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 359     SV *longest_float;      /* longest floating string found in pattern */
 360     SSize_t offset_float_min; /* earliest point in string it can appear */
 361     SSize_t offset_float_max; /* latest point in string it can appear */
 362     SSize_t *minlen_float;  /* pointer to the minlen relevant to the string */
 363     SSize_t lookbehind_float; /* is the pos of the string modified by LB */
 364     I32 flags;
 365     I32 whilem_c;
 366     SSize_t *last_closep;
 367     struct regnode_charclass_class *start_class;
 368 } scan_data_t;
 369
 370 /* The below is perhaps overboard, but this allows us to save a test at the
 371  * expense of a mask.  This is because on both EBCDIC and ASCII machines, 'A'
 372  * and 'a' differ by a single bit; the same with the upper and lower case of
 373  * all other ASCII-range alphabetics.  On ASCII platforms, they are 32 apart;
 374  * on EBCDIC, they are 64.  This uses an exclusive 'or' to find that bit and
 375  * then inverts it to form a mask, with just a single 0, in the bit position
 376  * where the upper- and lowercase differ.  XXX There are about 40 other
 377  * instances in the Perl core where this micro-optimization could be used.
 378  * Should decide if maintenance cost is worse, before changing those
 379  *
 380  * Returns a boolean as to whether or not 'v' is either a lowercase or
 381  * uppercase instance of 'c', where 'c' is in [A-Za-z].  If 'c' is a
 382  * compile-time constant, the generated code is better than some optimizing
 383  * compilers figure out, amounting to a mask and test.  The results are
 384  * meaningless if 'c' is not one of [A-Za-z] */
 385 #define isARG2_lower_or_UPPER_ARG1(c, v) \
 386                               (((v) & ~('A' ^ 'a')) ==  ((c) & ~('A' ^ 'a')))
 387
 388 /*
 389  * Forward declarations for pregcomp()'s friends.
 390  */
 391
 392 static const scan_data_t zero_scan_data =
 393   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 394
 395 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 396 #define SF_BEFORE_SEOL          0x0001
 397 #define SF_BEFORE_MEOL          0x0002
 398 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 399 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 400
 401 #ifdef NO_UNARY_PLUS
 402 #  define SF_FIX_SHIFT_EOL      (0+2)
 403 #  define SF_FL_SHIFT_EOL               (0+4)
 404 #else
 405 #  define SF_FIX_SHIFT_EOL      (+2)
 406 #  define SF_FL_SHIFT_EOL               (+4)
 407 #endif
 408
 409 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 410 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 411
 412 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 413 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 414 #define SF_IS_INF               0x0040
 415 #define SF_HAS_PAR              0x0080
 416 #define SF_IN_PAR               0x0100
 417 #define SF_HAS_EVAL             0x0200
 418 #define SCF_DO_SUBSTR           0x0400
 419 #define SCF_DO_STCLASS_AND      0x0800
 420 #define SCF_DO_STCLASS_OR       0x1000
 421 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 422 #define SCF_WHILEM_VISITED_POS  0x2000
 423
 424 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 425 #define SCF_SEEN_ACCEPT         0x8000
 426 #define SCF_TRIE_DOING_RESTUDY 0x10000
 427
 428 #define UTF cBOOL(RExC_utf8)
 429
 430 /* The enums for all these are ordered so things work out correctly */
 431 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 432 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 433 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 434 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 435 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 436 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
 437 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 438
 439 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 440
 441 #define OOB_NAMEDCLASS          -1
 442
 443 /* There is no code point that is out-of-bounds, so this is problematic.  But
 444  * its only current use is to initialize a variable that is always set before
 445  * looked at. */
 446 #define OOB_UNICODE             0xDEADBEEF
 447
 448 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 449 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 450
 451
 452 /* length of regex to show in messages that don't mark a position within */
 453 #define RegexLengthToShowInErrorMessages 127
 454
 455 /*
 456  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 457  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 458  * op/pragma/warn/regcomp.
 459  */
 460 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 461 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 462
 463 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 464
 465 /*
 466  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 467  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 468  * "...".
 469  */
 470 #define _FAIL(code) STMT_START {                                        \
 471     const char *ellipses = "";                                          \
 472     IV len = RExC_end - RExC_precomp;                                   \
 473                                                                         \
 474     if (!SIZE_ONLY)                                                     \
 475         SAVEFREESV(RExC_rx_sv);                                         \
 476     if (len > RegexLengthToShowInErrorMessages) {                       \
 477         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 478         len = RegexLengthToShowInErrorMessages - 10;                    \
 479         ellipses = "...";                                               \
 480     }                                                                   \
 481     code;                                                               \
 482 } STMT_END
 483
 484 #define FAIL(msg) _FAIL(                            \
 485     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 486             msg, (int)len, RExC_precomp, ellipses))
 487
 488 #define FAIL2(msg,arg) _FAIL(                       \
 489     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 490             arg, (int)len, RExC_precomp, ellipses))
 491
 492 /*
 493  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 494  */
 495 #define Simple_vFAIL(m) STMT_START {                                    \
 496     const IV offset = RExC_parse - RExC_precomp;                        \
 497     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 498             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 499 } STMT_END
 500
 501 /*
 502  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 503  */
 504 #define vFAIL(m) STMT_START {                           \
 505     if (!SIZE_ONLY)                                     \
 506         SAVEFREESV(RExC_rx_sv);                         \
 507     Simple_vFAIL(m);                                    \
 508 } STMT_END
 509
 510 /*
 511  * Like Simple_vFAIL(), but accepts two arguments.
 512  */
 513 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 514     const IV offset = RExC_parse - RExC_precomp;                        \
 515     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 516             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 517 } STMT_END
 518
 519 /*
 520  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 521  */
 522 #define vFAIL2(m,a1) STMT_START {                       \
 523     if (!SIZE_ONLY)                                     \
 524         SAVEFREESV(RExC_rx_sv);                         \
 525     Simple_vFAIL2(m, a1);                               \
 526 } STMT_END
 527
 528
 529 /*
 530  * Like Simple_vFAIL(), but accepts three arguments.
 531  */
 532 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 533     const IV offset = RExC_parse - RExC_precomp;                \
 534     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 535             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 536 } STMT_END
 537
 538 /*
 539  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 540  */
 541 #define vFAIL3(m,a1,a2) STMT_START {                    \
 542     if (!SIZE_ONLY)                                     \
 543         SAVEFREESV(RExC_rx_sv);                         \
 544     Simple_vFAIL3(m, a1, a2);                           \
 545 } STMT_END
 546
 547 /*
 548  * Like Simple_vFAIL(), but accepts four arguments.
 549  */
 550 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 551     const IV offset = RExC_parse - RExC_precomp;                \
 552     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 553             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 554 } STMT_END
 555
 556 #define vFAIL4(m,a1,a2,a3) STMT_START {                 \
 557     if (!SIZE_ONLY)                                     \
 558         SAVEFREESV(RExC_rx_sv);                         \
 559     Simple_vFAIL4(m, a1, a2, a3);                       \
 560 } STMT_END
 561
 562 /* m is not necessarily a "literal string", in this macro */
 563 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
 564     const IV offset = loc - RExC_precomp;                               \
 565     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
 566             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 567 } STMT_END
 568
 569 #define ckWARNreg(loc,m) STMT_START {                                   \
 570     const IV offset = loc - RExC_precomp;                               \
 571     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 572             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 573 } STMT_END
 574
 575 #define vWARN_dep(loc, m) STMT_START {                                  \
 576     const IV offset = loc - RExC_precomp;                               \
 577     Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION,     \
 578             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 579 } STMT_END
 580
 581 #define ckWARNdep(loc,m) STMT_START {                                   \
 582     const IV offset = loc - RExC_precomp;                               \
 583     Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),                   \
 584             m REPORT_LOCATION,                                          \
 585             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 586 } STMT_END
 587
 588 #define ckWARNregdep(loc,m) STMT_START {                                \
 589     const IV offset = loc - RExC_precomp;                               \
 590     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 591             m REPORT_LOCATION,                                          \
 592             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 593 } STMT_END
 594
 595 #define ckWARN2reg_d(loc,m, a1) STMT_START {                            \
 596     const IV offset = loc - RExC_precomp;                               \
 597     Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),                       \
 598             m REPORT_LOCATION,                                          \
 599             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 600 } STMT_END
 601
 602 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 603     const IV offset = loc - RExC_precomp;                               \
 604     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 605             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 606 } STMT_END
 607
 608 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 609     const IV offset = loc - RExC_precomp;                               \
 610     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 611             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 612 } STMT_END
 613
 614 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 615     const IV offset = loc - RExC_precomp;                               \
 616     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 617             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 618 } STMT_END
 619
 620 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 621     const IV offset = loc - RExC_precomp;                               \
 622     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 623             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 624 } STMT_END
 625
 626 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 627     const IV offset = loc - RExC_precomp;                               \
 628     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 629             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 630 } STMT_END
 631
 632 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 633     const IV offset = loc - RExC_precomp;                               \
 634     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 635             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 636 } STMT_END
 637
 638
 639 /* Allow for side effects in s */
 640 #define REGC(c,s) STMT_START {                  \
 641     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 642 } STMT_END
 643
 644 /* Macros for recording node offsets.   20001227 mjd@plover.com
 645  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 646  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 647  * Element 0 holds the number n.
 648  * Position is 1 indexed.
 649  */
 650 #ifndef RE_TRACK_PATTERN_OFFSETS
 651 #define Set_Node_Offset_To_R(node,byte)
 652 #define Set_Node_Offset(node,byte)
 653 #define Set_Cur_Node_Offset
 654 #define Set_Node_Length_To_R(node,len)
 655 #define Set_Node_Length(node,len)
 656 #define Set_Node_Cur_Length(node,start)
 657 #define Node_Offset(n)
 658 #define Node_Length(n)
 659 #define Set_Node_Offset_Length(node,offset,len)
 660 #define ProgLen(ri) ri->u.proglen
 661 #define SetProgLen(ri,x) ri->u.proglen = x
 662 #else
 663 #define ProgLen(ri) ri->u.offsets[0]
 664 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 665 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 666     if (! SIZE_ONLY) {                                                  \
 667         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 668                     __LINE__, (int)(node), (int)(byte)));               \
 669         if((node) < 0) {                                                \
 670             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 671         } else {                                                        \
 672             RExC_offsets[2*(node)-1] = (byte);                          \
 673         }                                                               \
 674     }                                                                   \
 675 } STMT_END
 676
 677 #define Set_Node_Offset(node,byte) \
 678     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 679 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 680
 681 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 682     if (! SIZE_ONLY) {                                                  \
 683         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 684                 __LINE__, (int)(node), (int)(len)));                    \
 685         if((node) < 0) {                                                \
 686             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 687         } else {                                                        \
 688             RExC_offsets[2*(node)] = (len);                             \
 689         }                                                               \
 690     }                                                                   \
 691 } STMT_END
 692
 693 #define Set_Node_Length(node,len) \
 694     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 695 #define Set_Node_Cur_Length(node, start)                \
 696     Set_Node_Length(node, RExC_parse - start)
 697
 698 /* Get offsets and lengths */
 699 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 700 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 701
 702 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 703     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 704     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 705 } STMT_END
 706 #endif
 707
 708 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 709 #define EXPERIMENTAL_INPLACESCAN
 710 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 711
 712 #define DEBUG_STUDYDATA(str,data,depth)                              \
 713 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 714     PerlIO_printf(Perl_debug_log,                                    \
 715         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 716         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 717         (int)(depth)*2, "",                                          \
 718         (IV)((data)->pos_min),                                       \
 719         (IV)((data)->pos_delta),                                     \
 720         (UV)((data)->flags),                                         \
 721         (IV)((data)->whilem_c),                                      \
 722         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 723         is_inf ? "INF " : ""                                         \
 724     );                                                               \
 725     if ((data)->last_found)                                          \
 726         PerlIO_printf(Perl_debug_log,                                \
 727             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 728             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 729             SvPVX_const((data)->last_found),                         \
 730             (IV)((data)->last_end),                                  \
 731             (IV)((data)->last_start_min),                            \
 732             (IV)((data)->last_start_max),                            \
 733             ((data)->longest &&                                      \
 734              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 735             SvPVX_const((data)->longest_fixed),                      \
 736             (IV)((data)->offset_fixed),                              \
 737             ((data)->longest &&                                      \
 738              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 739             SvPVX_const((data)->longest_float),                      \
 740             (IV)((data)->offset_float_min),                          \
 741             (IV)((data)->offset_float_max)                           \
 742         );                                                           \
 743     PerlIO_printf(Perl_debug_log,"\n");                              \
 744 });
 745
 746 /* Mark that we cannot extend a found fixed substring at this point.
 747    Update the longest found anchored substring and the longest found
 748    floating substrings if needed. */
 749
 750 STATIC void
 751 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
 752                     SSize_t *minlenp, int is_inf)
 753 {
 754     const STRLEN l = CHR_SVLEN(data->last_found);
 755     const STRLEN old_l = CHR_SVLEN(*data->longest);
 756     GET_RE_DEBUG_FLAGS_DECL;
 757
 758     PERL_ARGS_ASSERT_SCAN_COMMIT;
 759
 760     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 761         SvSetMagicSV(*data->longest, data->last_found);
 762         if (*data->longest == data->longest_fixed) {
 763             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 764             if (data->flags & SF_BEFORE_EOL)
 765                 data->flags
 766                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 767             else
 768                 data->flags &= ~SF_FIX_BEFORE_EOL;
 769             data->minlen_fixed=minlenp;
 770             data->lookbehind_fixed=0;
 771         }
 772         else { /* *data->longest == data->longest_float */
 773             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 774             data->offset_float_max = (l
 775                                       ? data->last_start_max
 776                                       : (data->pos_delta == SSize_t_MAX
 777                                          ? SSize_t_MAX
 778                                          : data->pos_min + data->pos_delta));
 779             if (is_inf
 780                  || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
 781                 data->offset_float_max = SSize_t_MAX;
 782             if (data->flags & SF_BEFORE_EOL)
 783                 data->flags
 784                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 785             else
 786                 data->flags &= ~SF_FL_BEFORE_EOL;
 787             data->minlen_float=minlenp;
 788             data->lookbehind_float=0;
 789         }
 790     }
 791     SvCUR_set(data->last_found, 0);
 792     {
 793         SV * const sv = data->last_found;
 794         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 795             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 796             if (mg)
 797                 mg->mg_len = 0;
 798         }
 799     }
 800     data->last_end = -1;
 801     data->flags &= ~SF_BEFORE_EOL;
 802     DEBUG_STUDYDATA("commit: ",data,0);
 803 }
 804
 805 /* These macros set, clear and test whether the synthetic start class ('ssc',
 806  * given by the parameter) matches an empty string (EOS).  This uses the
 807  * 'next_off' field in the node, to save a bit in the flags field.  The ssc
 808  * stands alone, so there is never a next_off, so this field is otherwise
 809  * unused.  The EOS information is used only for compilation, but theoretically
 810  * it could be passed on to the execution code.  This could be used to store
 811  * more than one bit of information, but only this one is currently used. */
 812 #define SET_SSC_EOS(node)   STMT_START { (node)->next_off = TRUE; } STMT_END
 813 #define CLEAR_SSC_EOS(node) STMT_START { (node)->next_off = FALSE; } STMT_END
 814 #define TEST_SSC_EOS(node)  cBOOL((node)->next_off)
 815
 816 /* Can match anything (initialization) */
 817 STATIC void
 818 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 819 {
 820     PERL_ARGS_ASSERT_CL_ANYTHING;
 821
 822     ANYOF_BITMAP_SETALL(cl);
 823     cl->flags = ANYOF_UNICODE_ALL;
 824     SET_SSC_EOS(cl);
 825
 826     /* If any portion of the regex is to operate under locale rules,
 827      * initialization includes it.  The reason this isn't done for all regexes
 828      * is that the optimizer was written under the assumption that locale was
 829      * all-or-nothing.  Given the complexity and lack of documentation in the
 830      * optimizer, and that there are inadequate test cases for locale, so many
 831      * parts of it may not work properly, it is safest to avoid locale unless
 832      * necessary. */
 833     if (RExC_contains_locale) {
 834         ANYOF_CLASS_SETALL(cl);     /* /l uses class */
 835         cl->flags |= ANYOF_LOCALE|ANYOF_CLASS|ANYOF_LOC_FOLD;
 836     }
 837     else {
 838         ANYOF_CLASS_ZERO(cl);       /* Only /l uses class now */
 839     }
 840 }
 841
 842 /* Can match anything (initialization) */
 843 STATIC int
 844 S_cl_is_anything(const struct regnode_charclass_class *cl)
 845 {
 846     int value;
 847
 848     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 849
 850     for (value = 0; value < ANYOF_MAX; value += 2)
 851         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 852             return 1;
 853     if (!(cl->flags & ANYOF_UNICODE_ALL))
 854         return 0;
 855     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 856         return 0;
 857     return 1;
 858 }
 859
 860 /* Can match anything (initialization) */
 861 STATIC void
 862 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 863 {
 864     PERL_ARGS_ASSERT_CL_INIT;
 865
 866     Zero(cl, 1, struct regnode_charclass_class);
 867     cl->type = ANYOF;
 868     cl_anything(pRExC_state, cl);
 869     ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 870 }
 871
 872 /* These two functions currently do the exact same thing */
 873 #define cl_init_zero            cl_init
 874
 875 /* 'AND' a given class with another one.  Can create false positives.  'cl'
 876  * should not be inverted.  'and_with->flags & ANYOF_CLASS' should be 0 if
 877  * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
 878 STATIC void
 879 S_cl_and(struct regnode_charclass_class *cl,
 880         const struct regnode_charclass_class *and_with)
 881 {
 882     PERL_ARGS_ASSERT_CL_AND;
 883
 884     assert(PL_regkind[and_with->type] == ANYOF);
 885
 886     /* I (khw) am not sure all these restrictions are necessary XXX */
 887     if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 888         && !(ANYOF_CLASS_TEST_ANY_SET(cl))
 889         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 890         && !(and_with->flags & ANYOF_LOC_FOLD)
 891         && !(cl->flags & ANYOF_LOC_FOLD)) {
 892         int i;
 893
 894         if (and_with->flags & ANYOF_INVERT)
 895             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 896                 cl->bitmap[i] &= ~and_with->bitmap[i];
 897         else
 898             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 899                 cl->bitmap[i] &= and_with->bitmap[i];
 900     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 901
 902     if (and_with->flags & ANYOF_INVERT) {
 903
 904         /* Here, the and'ed node is inverted.  Get the AND of the flags that
 905          * aren't affected by the inversion.  Those that are affected are
 906          * handled individually below */
 907         U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
 908         cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
 909         cl->flags |= affected_flags;
 910
 911         /* We currently don't know how to deal with things that aren't in the
 912          * bitmap, but we know that the intersection is no greater than what
 913          * is already in cl, so let there be false positives that get sorted
 914          * out after the synthetic start class succeeds, and the node is
 915          * matched for real. */
 916
 917         /* The inversion of these two flags indicate that the resulting
 918          * intersection doesn't have them */
 919         if (and_with->flags & ANYOF_UNICODE_ALL) {
 920             cl->flags &= ~ANYOF_UNICODE_ALL;
 921         }
 922         if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
 923             cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
 924         }
 925     }
 926     else {   /* and'd node is not inverted */
 927         U8 outside_bitmap_but_not_utf8; /* Temp variable */
 928
 929         if (! ANYOF_NONBITMAP(and_with)) {
 930
 931             /* Here 'and_with' doesn't match anything outside the bitmap
 932              * (except possibly ANYOF_UNICODE_ALL), which means the
 933              * intersection can't either, except for ANYOF_UNICODE_ALL, in
 934              * which case we don't know what the intersection is, but it's no
 935              * greater than what cl already has, so can just leave it alone,
 936              * with possible false positives */
 937             if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
 938                 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 939                 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
 940             }
 941         }
 942         else if (! ANYOF_NONBITMAP(cl)) {
 943
 944             /* Here, 'and_with' does match something outside the bitmap, and cl
 945              * doesn't have a list of things to match outside the bitmap.  If
 946              * cl can match all code points above 255, the intersection will
 947              * be those above-255 code points that 'and_with' matches.  If cl
 948              * can't match all Unicode code points, it means that it can't
 949              * match anything outside the bitmap (since the 'if' that got us
 950              * into this block tested for that), so we leave the bitmap empty.
 951              */
 952             if (cl->flags & ANYOF_UNICODE_ALL) {
 953                 ARG_SET(cl, ARG(and_with));
 954
 955                 /* and_with's ARG may match things that don't require UTF8.
 956                  * And now cl's will too, in spite of this being an 'and'.  See
 957                  * the comments below about the kludge */
 958                 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
 959             }
 960         }
 961         else {
 962             /* Here, both 'and_with' and cl match something outside the
 963              * bitmap.  Currently we do not do the intersection, so just match
 964              * whatever cl had at the beginning.  */
 965         }
 966
 967
 968         /* Take the intersection of the two sets of flags.  However, the
 969          * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'.  This is a
 970          * kludge around the fact that this flag is not treated like the others
 971          * which are initialized in cl_anything().  The way the optimizer works
 972          * is that the synthetic start class (SSC) is initialized to match
 973          * anything, and then the first time a real node is encountered, its
 974          * values are AND'd with the SSC's with the result being the values of
 975          * the real node.  However, there are paths through the optimizer where
 976          * the AND never gets called, so those initialized bits are set
 977          * inappropriately, which is not usually a big deal, as they just cause
 978          * false positives in the SSC, which will just mean a probably
 979          * imperceptible slow down in execution.  However this bit has a
 980          * higher false positive consequence in that it can cause utf8.pm,
 981          * utf8_heavy.pl ... to be loaded when not necessary, which is a much
 982          * bigger slowdown and also causes significant extra memory to be used.
 983          * In order to prevent this, the code now takes a different tack.  The
 984          * bit isn't set unless some part of the regular expression needs it,
 985          * but once set it won't get cleared.  This means that these extra
 986          * modules won't get loaded unless there was some path through the
 987          * pattern that would have required them anyway, and  so any false
 988          * positives that occur by not ANDing them out when they could be
 989          * aren't as severe as they would be if we treated this bit like all
 990          * the others */
 991         outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
 992                                       & ANYOF_NONBITMAP_NON_UTF8;
 993         cl->flags &= and_with->flags;
 994         cl->flags |= outside_bitmap_but_not_utf8;
 995     }
 996 }
 997
 998 /* 'OR' a given class with another one.  Can create false positives.  'cl'
 999  * should not be inverted.  'or_with->flags & ANYOF_CLASS' should be 0 if
1000  * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
1001 STATIC void
1002 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
1003 {
1004     PERL_ARGS_ASSERT_CL_OR;
1005
1006     if (or_with->flags & ANYOF_INVERT) {
1007
1008         /* Here, the or'd node is to be inverted.  This means we take the
1009          * complement of everything not in the bitmap, but currently we don't
1010          * know what that is, so give up and match anything */
1011         if (ANYOF_NONBITMAP(or_with)) {
1012             cl_anything(pRExC_state, cl);
1013         }
1014         /* We do not use
1015          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
1016          *   <= (B1 | !B2) | (CL1 | !CL2)
1017          * which is wasteful if CL2 is small, but we ignore CL2:
1018          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
1019          * XXXX Can we handle case-fold?  Unclear:
1020          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
1021          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
1022          */
1023         else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
1024              && !(or_with->flags & ANYOF_LOC_FOLD)
1025              && !(cl->flags & ANYOF_LOC_FOLD) ) {
1026             int i;
1027
1028             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1029                 cl->bitmap[i] |= ~or_with->bitmap[i];
1030         } /* XXXX: logic is complicated otherwise */
1031         else {
1032             cl_anything(pRExC_state, cl);
1033         }
1034
1035         /* And, we can just take the union of the flags that aren't affected
1036          * by the inversion */
1037         cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
1038
1039         /* For the remaining flags:
1040             ANYOF_UNICODE_ALL and inverted means to not match anything above
1041                     255, which means that the union with cl should just be
1042                     what cl has in it, so can ignore this flag
1043             ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
1044                     is (ASCII) 127-255 to match them, but then invert that, so
1045                     the union with cl should just be what cl has in it, so can
1046                     ignore this flag
1047          */
1048     } else {    /* 'or_with' is not inverted */
1049         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
1050         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
1051              && (!(or_with->flags & ANYOF_LOC_FOLD)
1052                  || (cl->flags & ANYOF_LOC_FOLD)) ) {
1053             int i;
1054
1055             /* OR char bitmap and class bitmap separately */
1056             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1057                 cl->bitmap[i] |= or_with->bitmap[i];
1058             if (or_with->flags & ANYOF_CLASS) {
1059                 ANYOF_CLASS_OR(or_with, cl);
1060             }
1061         }
1062         else { /* XXXX: logic is complicated, leave it along for a moment. */
1063             cl_anything(pRExC_state, cl);
1064         }
1065
1066         if (ANYOF_NONBITMAP(or_with)) {
1067
1068             /* Use the added node's outside-the-bit-map match if there isn't a
1069              * conflict.  If there is a conflict (both nodes match something
1070              * outside the bitmap, but what they match outside is not the same
1071              * pointer, and hence not easily compared until XXX we extend
1072              * inversion lists this far), give up and allow the start class to
1073              * match everything outside the bitmap.  If that stuff is all above
1074              * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1075             if (! ANYOF_NONBITMAP(cl)) {
1076                 ARG_SET(cl, ARG(or_with));
1077             }
1078             else if (ARG(cl) != ARG(or_with)) {
1079
1080                 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1081                     cl_anything(pRExC_state, cl);
1082                 }
1083                 else {
1084                     cl->flags |= ANYOF_UNICODE_ALL;
1085                 }
1086             }
1087         }
1088
1089         /* Take the union */
1090         cl->flags |= or_with->flags;
1091     }
1092 }
1093
1094 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1095 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1096 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1097 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1098
1099
1100 #ifdef DEBUGGING
1101 /*
1102    dump_trie(trie,widecharmap,revcharmap)
1103    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1104    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1105
1106    These routines dump out a trie in a somewhat readable format.
1107    The _interim_ variants are used for debugging the interim
1108    tables that are used to generate the final compressed
1109    representation which is what dump_trie expects.
1110
1111    Part of the reason for their existence is to provide a form
1112    of documentation as to how the different representations function.
1113
1114 */
1115
1116 /*
1117   Dumps the final compressed table form of the trie to Perl_debug_log.
1118   Used for debugging make_trie().
1119 */
1120
1121 STATIC void
1122 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1123             AV *revcharmap, U32 depth)
1124 {
1125     U32 state;
1126     SV *sv=sv_newmortal();
1127     int colwidth= widecharmap ? 6 : 4;
1128     U16 word;
1129     GET_RE_DEBUG_FLAGS_DECL;
1130
1131     PERL_ARGS_ASSERT_DUMP_TRIE;
1132
1133     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1134         (int)depth * 2 + 2,"",
1135         "Match","Base","Ofs" );
1136
1137     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1138         SV ** const tmp = av_fetch( revcharmap, state, 0);
1139         if ( tmp ) {
1140             PerlIO_printf( Perl_debug_log, "%*s",
1141                 colwidth,
1142                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1143                             PL_colors[0], PL_colors[1],
1144                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1145                             PERL_PV_ESCAPE_FIRSTCHAR
1146                 )
1147             );
1148         }
1149     }
1150     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1151         (int)depth * 2 + 2,"");
1152
1153     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1154         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1155     PerlIO_printf( Perl_debug_log, "\n");
1156
1157     for( state = 1 ; state < trie->statecount ; state++ ) {
1158         const U32 base = trie->states[ state ].trans.base;
1159
1160         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1161
1162         if ( trie->states[ state ].wordnum ) {
1163             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1164         } else {
1165             PerlIO_printf( Perl_debug_log, "%6s", "" );
1166         }
1167
1168         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1169
1170         if ( base ) {
1171             U32 ofs = 0;
1172
1173             while( ( base + ofs  < trie->uniquecharcount ) ||
1174                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1175                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1176                     ofs++;
1177
1178             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1179
1180             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1181                 if ( ( base + ofs >= trie->uniquecharcount ) &&
1182                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1183                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1184                 {
1185                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1186                     colwidth,
1187                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1188                 } else {
1189                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1190                 }
1191             }
1192
1193             PerlIO_printf( Perl_debug_log, "]");
1194
1195         }
1196         PerlIO_printf( Perl_debug_log, "\n" );
1197     }
1198     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1199     for (word=1; word <= trie->wordcount; word++) {
1200         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1201             (int)word, (int)(trie->wordinfo[word].prev),
1202             (int)(trie->wordinfo[word].len));
1203     }
1204     PerlIO_printf(Perl_debug_log, "\n" );
1205 }
1206 /*
1207   Dumps a fully constructed but uncompressed trie in list form.
1208   List tries normally only are used for construction when the number of
1209   possible chars (trie->uniquecharcount) is very high.
1210   Used for debugging make_trie().
1211 */
1212 STATIC void
1213 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1214                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1215                          U32 depth)
1216 {
1217     U32 state;
1218     SV *sv=sv_newmortal();
1219     int colwidth= widecharmap ? 6 : 4;
1220     GET_RE_DEBUG_FLAGS_DECL;
1221
1222     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1223
1224     /* print out the table precompression.  */
1225     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1226         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1227         "------:-----+-----------------\n" );
1228
1229     for( state=1 ; state < next_alloc ; state ++ ) {
1230         U16 charid;
1231
1232         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1233             (int)depth * 2 + 2,"", (UV)state  );
1234         if ( ! trie->states[ state ].wordnum ) {
1235             PerlIO_printf( Perl_debug_log, "%5s| ","");
1236         } else {
1237             PerlIO_printf( Perl_debug_log, "W%4x| ",
1238                 trie->states[ state ].wordnum
1239             );
1240         }
1241         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1242             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1243             if ( tmp ) {
1244                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1245                     colwidth,
1246                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1247                             PL_colors[0], PL_colors[1],
1248                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1249                             PERL_PV_ESCAPE_FIRSTCHAR
1250                     ) ,
1251                     TRIE_LIST_ITEM(state,charid).forid,
1252                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1253                 );
1254                 if (!(charid % 10))
1255                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1256                         (int)((depth * 2) + 14), "");
1257             }
1258         }
1259         PerlIO_printf( Perl_debug_log, "\n");
1260     }
1261 }
1262
1263 /*
1264   Dumps a fully constructed but uncompressed trie in table form.
1265   This is the normal DFA style state transition table, with a few
1266   twists to facilitate compression later.
1267   Used for debugging make_trie().
1268 */
1269 STATIC void
1270 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1271                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1272                           U32 depth)
1273 {
1274     U32 state;
1275     U16 charid;
1276     SV *sv=sv_newmortal();
1277     int colwidth= widecharmap ? 6 : 4;
1278     GET_RE_DEBUG_FLAGS_DECL;
1279
1280     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1281
1282     /*
1283        print out the table precompression so that we can do a visual check
1284        that they are identical.
1285      */
1286
1287     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1288
1289     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1290         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1291         if ( tmp ) {
1292             PerlIO_printf( Perl_debug_log, "%*s",
1293                 colwidth,
1294                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1295                             PL_colors[0], PL_colors[1],
1296                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1297                             PERL_PV_ESCAPE_FIRSTCHAR
1298                 )
1299             );
1300         }
1301     }
1302
1303     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1304
1305     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1306         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1307     }
1308
1309     PerlIO_printf( Perl_debug_log, "\n" );
1310
1311     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1312
1313         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1314             (int)depth * 2 + 2,"",
1315             (UV)TRIE_NODENUM( state ) );
1316
1317         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1318             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1319             if (v)
1320                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1321             else
1322                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1323         }
1324         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1325             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1326         } else {
1327             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1328             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1329         }
1330     }
1331 }
1332
1333 #endif
1334
1335
1336 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1337   startbranch: the first branch in the whole branch sequence
1338   first      : start branch of sequence of branch-exact nodes.
1339                May be the same as startbranch
1340   last       : Thing following the last branch.
1341                May be the same as tail.
1342   tail       : item following the branch sequence
1343   count      : words in the sequence
1344   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1345   depth      : indent depth
1346
1347 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1348
1349 A trie is an N'ary tree where the branches are determined by digital
1350 decomposition of the key. IE, at the root node you look up the 1st character and
1351 follow that branch repeat until you find the end of the branches. Nodes can be
1352 marked as "accepting" meaning they represent a complete word. Eg:
1353
1354   /he|she|his|hers/
1355
1356 would convert into the following structure. Numbers represent states, letters
1357 following numbers represent valid transitions on the letter from that state, if
1358 the number is in square brackets it represents an accepting state, otherwise it
1359 will be in parenthesis.
1360
1361       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1362       |    |
1363       |   (2)
1364       |    |
1365      (1)   +-i->(6)-+-s->[7]
1366       |
1367       +-s->(3)-+-h->(4)-+-e->[5]
1368
1369       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1370
1371 This shows that when matching against the string 'hers' we will begin at state 1
1372 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1373 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1374 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1375 single traverse. We store a mapping from accepting to state to which word was
1376 matched, and then when we have multiple possibilities we try to complete the
1377 rest of the regex in the order in which they occured in the alternation.
1378
1379 The only prior NFA like behaviour that would be changed by the TRIE support is
1380 the silent ignoring of duplicate alternations which are of the form:
1381
1382  / (DUPE|DUPE) X? (?{ ... }) Y /x
1383
1384 Thus EVAL blocks following a trie may be called a different number of times with
1385 and without the optimisation. With the optimisations dupes will be silently
1386 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1387 the following demonstrates:
1388
1389  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1390
1391 which prints out 'word' three times, but
1392
1393  'words'=~/(word|word|word)(?{ print $1 })S/
1394
1395 which doesnt print it out at all. This is due to other optimisations kicking in.
1396
1397 Example of what happens on a structural level:
1398
1399 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1400
1401    1: CURLYM[1] {1,32767}(18)
1402    5:   BRANCH(8)
1403    6:     EXACT <ac>(16)
1404    8:   BRANCH(11)
1405    9:     EXACT <ad>(16)
1406   11:   BRANCH(14)
1407   12:     EXACT <ab>(16)
1408   16:   SUCCEED(0)
1409   17:   NOTHING(18)
1410   18: END(0)
1411
1412 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1413 and should turn into:
1414
1415    1: CURLYM[1] {1,32767}(18)
1416    5:   TRIE(16)
1417         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1418           <ac>
1419           <ad>
1420           <ab>
1421   16:   SUCCEED(0)
1422   17:   NOTHING(18)
1423   18: END(0)
1424
1425 Cases where tail != last would be like /(?foo|bar)baz/:
1426
1427    1: BRANCH(4)
1428    2:   EXACT <foo>(8)
1429    4: BRANCH(7)
1430    5:   EXACT <bar>(8)
1431    7: TAIL(8)
1432    8: EXACT <baz>(10)
1433   10: END(0)
1434
1435 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1436 and would end up looking like:
1437
1438     1: TRIE(8)
1439       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1440         <foo>
1441         <bar>
1442    7: TAIL(8)
1443    8: EXACT <baz>(10)
1444   10: END(0)
1445
1446     d = uvchr_to_utf8_flags(d, uv, 0);
1447
1448 is the recommended Unicode-aware way of saying
1449
1450     *(d++) = uv;
1451 */
1452
1453 #define TRIE_STORE_REVCHAR(val)                                            \
1454     STMT_START {                                                           \
1455         if (UTF) {                                                         \
1456             SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1457             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1458             unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1459             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1460             SvPOK_on(zlopp);                                               \
1461             SvUTF8_on(zlopp);                                              \
1462             av_push(revcharmap, zlopp);                                    \
1463         } else {                                                           \
1464             char ooooff = (char)val;                                           \
1465             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1466         }                                                                  \
1467         } STMT_END
1468
1469 /* This gets the next character from the input, folding it if not already
1470  * folded. */
1471 #define TRIE_READ_CHAR STMT_START {                                           \
1472     wordlen++;                                                                \
1473     if ( UTF ) {                                                              \
1474         /* if it is UTF then it is either already folded, or does not need    \
1475          * folding */                                                         \
1476         uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
1477     }                                                                         \
1478     else if (folder == PL_fold_latin1) {                                      \
1479         /* This folder implies Unicode rules, which in the range expressible  \
1480          *  by not UTF is the lower case, with the two exceptions, one of     \
1481          *  which should have been taken care of before calling this */       \
1482         assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
1483         uvc = toLOWER_L1(*uc);                                                \
1484         if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
1485         len = 1;                                                              \
1486     } else {                                                                  \
1487         /* raw data, will be folded later if needed */                        \
1488         uvc = (U32)*uc;                                                       \
1489         len = 1;                                                              \
1490     }                                                                         \
1491 } STMT_END
1492
1493
1494
1495 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1496     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1497         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1498         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1499     }                                                           \
1500     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1501     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1502     TRIE_LIST_CUR( state )++;                                   \
1503 } STMT_END
1504
1505 #define TRIE_LIST_NEW(state) STMT_START {                       \
1506     Newxz( trie->states[ state ].trans.list,               \
1507         4, reg_trie_trans_le );                                 \
1508      TRIE_LIST_CUR( state ) = 1;                                \
1509      TRIE_LIST_LEN( state ) = 4;                                \
1510 } STMT_END
1511
1512 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1513     U16 dupe= trie->states[ state ].wordnum;                    \
1514     regnode * const noper_next = regnext( noper );              \
1515                                                                 \
1516     DEBUG_r({                                                   \
1517         /* store the word for dumping */                        \
1518         SV* tmp;                                                \
1519         if (OP(noper) != NOTHING)                               \
1520             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1521         else                                                    \
1522             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1523         av_push( trie_words, tmp );                             \
1524     });                                                         \
1525                                                                 \
1526     curword++;                                                  \
1527     trie->wordinfo[curword].prev   = 0;                         \
1528     trie->wordinfo[curword].len    = wordlen;                   \
1529     trie->wordinfo[curword].accept = state;                     \
1530                                                                 \
1531     if ( noper_next < tail ) {                                  \
1532         if (!trie->jump)                                        \
1533             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1534         trie->jump[curword] = (U16)(noper_next - convert);      \
1535         if (!jumper)                                            \
1536             jumper = noper_next;                                \
1537         if (!nextbranch)                                        \
1538             nextbranch= regnext(cur);                           \
1539     }                                                           \
1540                                                                 \
1541     if ( dupe ) {                                               \
1542         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1543         /* chain, so that when the bits of chain are later    */\
1544         /* linked together, the dups appear in the chain      */\
1545         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1546         trie->wordinfo[dupe].prev = curword;                    \
1547     } else {                                                    \
1548         /* we haven't inserted this word yet.                */ \
1549         trie->states[ state ].wordnum = curword;                \
1550     }                                                           \
1551 } STMT_END
1552
1553
1554 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1555      ( ( base + charid >=  ucharcount                                   \
1556          && base + charid < ubound                                      \
1557          && state == trie->trans[ base - ucharcount + charid ].check    \
1558          && trie->trans[ base - ucharcount + charid ].next )            \
1559            ? trie->trans[ base - ucharcount + charid ].next             \
1560            : ( state==1 ? special : 0 )                                 \
1561       )
1562
1563 #define MADE_TRIE       1
1564 #define MADE_JUMP_TRIE  2
1565 #define MADE_EXACT_TRIE 4
1566
1567 STATIC I32
1568 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1569 {
1570     dVAR;
1571     /* first pass, loop through and scan words */
1572     reg_trie_data *trie;
1573     HV *widecharmap = NULL;
1574     AV *revcharmap = newAV();
1575     regnode *cur;
1576     STRLEN len = 0;
1577     UV uvc = 0;
1578     U16 curword = 0;
1579     U32 next_alloc = 0;
1580     regnode *jumper = NULL;
1581     regnode *nextbranch = NULL;
1582     regnode *convert = NULL;
1583     U32 *prev_states; /* temp array mapping each state to previous one */
1584     /* we just use folder as a flag in utf8 */
1585     const U8 * folder = NULL;
1586
1587 #ifdef DEBUGGING
1588     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1589     AV *trie_words = NULL;
1590     /* along with revcharmap, this only used during construction but both are
1591      * useful during debugging so we store them in the struct when debugging.
1592      */
1593 #else
1594     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1595     STRLEN trie_charcount=0;
1596 #endif
1597     SV *re_trie_maxbuff;
1598     GET_RE_DEBUG_FLAGS_DECL;
1599
1600     PERL_ARGS_ASSERT_MAKE_TRIE;
1601 #ifndef DEBUGGING
1602     PERL_UNUSED_ARG(depth);
1603 #endif
1604
1605     switch (flags) {
1606         case EXACT: break;
1607         case EXACTFA:
1608         case EXACTFU_SS:
1609         case EXACTFU: folder = PL_fold_latin1; break;
1610         case EXACTF:  folder = PL_fold; break;
1611         case EXACTFL: folder = PL_fold_locale; break;
1612         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1613     }
1614
1615     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1616     trie->refcount = 1;
1617     trie->startstate = 1;
1618     trie->wordcount = word_count;
1619     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1620     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1621     if (flags == EXACT)
1622         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1623     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1624                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1625
1626     DEBUG_r({
1627         trie_words = newAV();
1628     });
1629
1630     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1631     if (!SvIOK(re_trie_maxbuff)) {
1632         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1633     }
1634     DEBUG_TRIE_COMPILE_r({
1635                 PerlIO_printf( Perl_debug_log,
1636                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1637                   (int)depth * 2 + 2, "",
1638                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1639                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1640                   (int)depth);
1641     });
1642
1643    /* Find the node we are going to overwrite */
1644     if ( first == startbranch && OP( last ) != BRANCH ) {
1645         /* whole branch chain */
1646         convert = first;
1647     } else {
1648         /* branch sub-chain */
1649         convert = NEXTOPER( first );
1650     }
1651
1652     /*  -- First loop and Setup --
1653
1654        We first traverse the branches and scan each word to determine if it
1655        contains widechars, and how many unique chars there are, this is
1656        important as we have to build a table with at least as many columns as we
1657        have unique chars.
1658
1659        We use an array of integers to represent the character codes 0..255
1660        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1661        native representation of the character value as the key and IV's for the
1662        coded index.
1663
1664        *TODO* If we keep track of how many times each character is used we can
1665        remap the columns so that the table compression later on is more
1666        efficient in terms of memory by ensuring the most common value is in the
1667        middle and the least common are on the outside.  IMO this would be better
1668        than a most to least common mapping as theres a decent chance the most
1669        common letter will share a node with the least common, meaning the node
1670        will not be compressible. With a middle is most common approach the worst
1671        case is when we have the least common nodes twice.
1672
1673      */
1674
1675     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1676         regnode *noper = NEXTOPER( cur );
1677         const U8 *uc = (U8*)STRING( noper );
1678         const U8 *e  = uc + STR_LEN( noper );
1679         STRLEN foldlen = 0;
1680         U32 wordlen      = 0;         /* required init */
1681         STRLEN minbytes = 0;
1682         STRLEN maxbytes = 0;
1683         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1684
1685         if (OP(noper) == NOTHING) {
1686             regnode *noper_next= regnext(noper);
1687             if (noper_next != tail && OP(noper_next) == flags) {
1688                 noper = noper_next;
1689                 uc= (U8*)STRING(noper);
1690                 e= uc + STR_LEN(noper);
1691                 trie->minlen= STR_LEN(noper);
1692             } else {
1693                 trie->minlen= 0;
1694                 continue;
1695             }
1696         }
1697
1698         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1699             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1700                                           regardless of encoding */
1701             if (OP( noper ) == EXACTFU_SS) {
1702                 /* false positives are ok, so just set this */
1703                 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
1704             }
1705         }
1706         for ( ; uc < e ; uc += len ) {
1707             TRIE_CHARCOUNT(trie)++;
1708             TRIE_READ_CHAR;
1709
1710             /* Acummulate to the current values, the range in the number of
1711              * bytes that this character could match.  The max is presumed to
1712              * be the same as the folded input (which TRIE_READ_CHAR returns),
1713              * except that when this is not in UTF-8, it could be matched
1714              * against a string which is UTF-8, and the variant characters
1715              * could be 2 bytes instead of the 1 here.  Likewise, for the
1716              * minimum number of bytes when not folded.  When folding, the min
1717              * is assumed to be 1 byte could fold to match the single character
1718              * here, or in the case of a multi-char fold, 1 byte can fold to
1719              * the whole sequence.  'foldlen' is used to denote whether we are
1720              * in such a sequence, skipping the min setting if so.  XXX TODO
1721              * Use the exact list of what folds to each character, from
1722              * PL_utf8_foldclosures */
1723             if (UTF) {
1724                 maxbytes += UTF8SKIP(uc);
1725                 if (! folder) {
1726                     /* A non-UTF-8 string could be 1 byte to match our 2 */
1727                     minbytes += (UTF8_IS_DOWNGRADEABLE_START(*uc))
1728                                 ? 1
1729                                 : UTF8SKIP(uc);
1730                 }
1731                 else {
1732                     if (foldlen) {
1733                         foldlen -= UTF8SKIP(uc);
1734                     }
1735                     else {
1736                         foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e);
1737                         minbytes++;
1738                     }
1739                 }
1740             }
1741             else {
1742                 maxbytes += (UNI_IS_INVARIANT(*uc))
1743                              ? 1
1744                              : 2;
1745                 if (! folder) {
1746                     minbytes++;
1747                 }
1748                 else {
1749                     if (foldlen) {
1750                         foldlen--;
1751                     }
1752                     else {
1753                         foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e);
1754                         minbytes++;
1755                     }
1756                 }
1757             }
1758             if ( uvc < 256 ) {
1759                 if ( folder ) {
1760                     U8 folded= folder[ (U8) uvc ];
1761                     if ( !trie->charmap[ folded ] ) {
1762                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
1763                         TRIE_STORE_REVCHAR( folded );
1764                     }
1765                 }
1766                 if ( !trie->charmap[ uvc ] ) {
1767                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1768                     TRIE_STORE_REVCHAR( uvc );
1769                 }
1770                 if ( set_bit ) {
1771                     /* store the codepoint in the bitmap, and its folded
1772                      * equivalent. */
1773                     TRIE_BITMAP_SET(trie, uvc);
1774
1775                     /* store the folded codepoint */
1776                     if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1777
1778                     if ( !UTF ) {
1779                         /* store first byte of utf8 representation of
1780                            variant codepoints */
1781                         if (! NATIVE_IS_INVARIANT(uvc)) {
1782                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1783                         }
1784                     }
1785                     set_bit = 0; /* We've done our bit :-) */
1786                 }
1787             } else {
1788                 SV** svpp;
1789                 if ( !widecharmap )
1790                     widecharmap = newHV();
1791
1792                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1793
1794                 if ( !svpp )
1795                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1796
1797                 if ( !SvTRUE( *svpp ) ) {
1798                     sv_setiv( *svpp, ++trie->uniquecharcount );
1799                     TRIE_STORE_REVCHAR(uvc);
1800                 }
1801             }
1802         }
1803         if( cur == first ) {
1804             trie->minlen = minbytes;
1805             trie->maxlen = maxbytes;
1806         } else if (minbytes < trie->minlen) {
1807             trie->minlen = minbytes;
1808         } else if (maxbytes > trie->maxlen) {
1809             trie->maxlen = maxbytes;
1810         }
1811     } /* end first pass */
1812     DEBUG_TRIE_COMPILE_r(
1813         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1814                 (int)depth * 2 + 2,"",
1815                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1816                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1817                 (int)trie->minlen, (int)trie->maxlen )
1818     );
1819
1820     /*
1821         We now know what we are dealing with in terms of unique chars and
1822         string sizes so we can calculate how much memory a naive
1823         representation using a flat table  will take. If it's over a reasonable
1824         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1825         conservative but potentially much slower representation using an array
1826         of lists.
1827
1828         At the end we convert both representations into the same compressed
1829         form that will be used in regexec.c for matching with. The latter
1830         is a form that cannot be used to construct with but has memory
1831         properties similar to the list form and access properties similar
1832         to the table form making it both suitable for fast searches and
1833         small enough that its feasable to store for the duration of a program.
1834
1835         See the comment in the code where the compressed table is produced
1836         inplace from the flat tabe representation for an explanation of how
1837         the compression works.
1838
1839     */
1840
1841
1842     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1843     prev_states[1] = 0;
1844
1845     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1846         /*
1847             Second Pass -- Array Of Lists Representation
1848
1849             Each state will be represented by a list of charid:state records
1850             (reg_trie_trans_le) the first such element holds the CUR and LEN
1851             points of the allocated array. (See defines above).
1852
1853             We build the initial structure using the lists, and then convert
1854             it into the compressed table form which allows faster lookups
1855             (but cant be modified once converted).
1856         */
1857
1858         STRLEN transcount = 1;
1859
1860         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1861             "%*sCompiling trie using list compiler\n",
1862             (int)depth * 2 + 2, ""));
1863
1864         trie->states = (reg_trie_state *)
1865             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1866                                   sizeof(reg_trie_state) );
1867         TRIE_LIST_NEW(1);
1868         next_alloc = 2;
1869
1870         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1871
1872             regnode *noper   = NEXTOPER( cur );
1873             U8 *uc           = (U8*)STRING( noper );
1874             const U8 *e      = uc + STR_LEN( noper );
1875             U32 state        = 1;         /* required init */
1876             U16 charid       = 0;         /* sanity init */
1877             U32 wordlen      = 0;         /* required init */
1878
1879             if (OP(noper) == NOTHING) {
1880                 regnode *noper_next= regnext(noper);
1881                 if (noper_next != tail && OP(noper_next) == flags) {
1882                     noper = noper_next;
1883                     uc= (U8*)STRING(noper);
1884                     e= uc + STR_LEN(noper);
1885                 }
1886             }
1887
1888             if (OP(noper) != NOTHING) {
1889                 for ( ; uc < e ; uc += len ) {
1890
1891                     TRIE_READ_CHAR;
1892
1893                     if ( uvc < 256 ) {
1894                         charid = trie->charmap[ uvc ];
1895                     } else {
1896                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1897                         if ( !svpp ) {
1898                             charid = 0;
1899                         } else {
1900                             charid=(U16)SvIV( *svpp );
1901                         }
1902                     }
1903                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1904                     if ( charid ) {
1905
1906                         U16 check;
1907                         U32 newstate = 0;
1908
1909                         charid--;
1910                         if ( !trie->states[ state ].trans.list ) {
1911                             TRIE_LIST_NEW( state );
1912                         }
1913                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1914                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1915                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1916                                 break;
1917                             }
1918                         }
1919                         if ( ! newstate ) {
1920                             newstate = next_alloc++;
1921                             prev_states[newstate] = state;
1922                             TRIE_LIST_PUSH( state, charid, newstate );
1923                             transcount++;
1924                         }
1925                         state = newstate;
1926                     } else {
1927                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1928                     }
1929                 }
1930             }
1931             TRIE_HANDLE_WORD(state);
1932
1933         } /* end second pass */
1934
1935         /* next alloc is the NEXT state to be allocated */
1936         trie->statecount = next_alloc;
1937         trie->states = (reg_trie_state *)
1938             PerlMemShared_realloc( trie->states,
1939                                    next_alloc
1940                                    * sizeof(reg_trie_state) );
1941
1942         /* and now dump it out before we compress it */
1943         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1944                                                          revcharmap, next_alloc,
1945                                                          depth+1)
1946         );
1947
1948         trie->trans = (reg_trie_trans *)
1949             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1950         {
1951             U32 state;
1952             U32 tp = 0;
1953             U32 zp = 0;
1954
1955
1956             for( state=1 ; state < next_alloc ; state ++ ) {
1957                 U32 base=0;
1958
1959                 /*
1960                 DEBUG_TRIE_COMPILE_MORE_r(
1961                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1962                 );
1963                 */
1964
1965                 if (trie->states[state].trans.list) {
1966                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1967                     U16 maxid=minid;
1968                     U16 idx;
1969
1970                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1971                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1972                         if ( forid < minid ) {
1973                             minid=forid;
1974                         } else if ( forid > maxid ) {
1975                             maxid=forid;
1976                         }
1977                     }
1978                     if ( transcount < tp + maxid - minid + 1) {
1979                         transcount *= 2;
1980                         trie->trans = (reg_trie_trans *)
1981                             PerlMemShared_realloc( trie->trans,
1982                                                      transcount
1983                                                      * sizeof(reg_trie_trans) );
1984                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1985                     }
1986                     base = trie->uniquecharcount + tp - minid;
1987                     if ( maxid == minid ) {
1988                         U32 set = 0;
1989                         for ( ; zp < tp ; zp++ ) {
1990                             if ( ! trie->trans[ zp ].next ) {
1991                                 base = trie->uniquecharcount + zp - minid;
1992                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1993                                 trie->trans[ zp ].check = state;
1994                                 set = 1;
1995                                 break;
1996                             }
1997                         }
1998                         if ( !set ) {
1999                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
2000                             trie->trans[ tp ].check = state;
2001                             tp++;
2002                             zp = tp;
2003                         }
2004                     } else {
2005                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2006                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
2007                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
2008                             trie->trans[ tid ].check = state;
2009                         }
2010                         tp += ( maxid - minid + 1 );
2011                     }
2012                     Safefree(trie->states[ state ].trans.list);
2013                 }
2014                 /*
2015                 DEBUG_TRIE_COMPILE_MORE_r(
2016                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2017                 );
2018                 */
2019                 trie->states[ state ].trans.base=base;
2020             }
2021             trie->lasttrans = tp + 1;
2022         }
2023     } else {
2024         /*
2025            Second Pass -- Flat Table Representation.
2026
2027            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
2028            We know that we will need Charcount+1 trans at most to store the data
2029            (one row per char at worst case) So we preallocate both structures
2030            assuming worst case.
2031
2032            We then construct the trie using only the .next slots of the entry
2033            structs.
2034
2035            We use the .check field of the first entry of the node temporarily to
2036            make compression both faster and easier by keeping track of how many non
2037            zero fields are in the node.
2038
2039            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2040            transition.
2041
2042            There are two terms at use here: state as a TRIE_NODEIDX() which is a
2043            number representing the first entry of the node, and state as a
2044            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
2045            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
2046            are 2 entrys per node. eg:
2047
2048              A B       A B
2049           1. 2 4    1. 3 7
2050           2. 0 3    3. 0 5
2051           3. 0 0    5. 0 0
2052           4. 0 0    7. 0 0
2053
2054            The table is internally in the right hand, idx form. However as we also
2055            have to deal with the states array which is indexed by nodenum we have to
2056            use TRIE_NODENUM() to convert.
2057
2058         */
2059         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2060             "%*sCompiling trie using table compiler\n",
2061             (int)depth * 2 + 2, ""));
2062
2063         trie->trans = (reg_trie_trans *)
2064             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2065                                   * trie->uniquecharcount + 1,
2066                                   sizeof(reg_trie_trans) );
2067         trie->states = (reg_trie_state *)
2068             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2069                                   sizeof(reg_trie_state) );
2070         next_alloc = trie->uniquecharcount + 1;
2071
2072
2073         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2074
2075             regnode *noper   = NEXTOPER( cur );
2076             const U8 *uc     = (U8*)STRING( noper );
2077             const U8 *e      = uc + STR_LEN( noper );
2078
2079             U32 state        = 1;         /* required init */
2080
2081             U16 charid       = 0;         /* sanity init */
2082             U32 accept_state = 0;         /* sanity init */
2083
2084             U32 wordlen      = 0;         /* required init */
2085
2086             if (OP(noper) == NOTHING) {
2087                 regnode *noper_next= regnext(noper);
2088                 if (noper_next != tail && OP(noper_next) == flags) {
2089                     noper = noper_next;
2090                     uc= (U8*)STRING(noper);
2091                     e= uc + STR_LEN(noper);
2092                 }
2093             }
2094
2095             if ( OP(noper) != NOTHING ) {
2096                 for ( ; uc < e ; uc += len ) {
2097
2098                     TRIE_READ_CHAR;
2099
2100                     if ( uvc < 256 ) {
2101                         charid = trie->charmap[ uvc ];
2102                     } else {
2103                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2104                         charid = svpp ? (U16)SvIV(*svpp) : 0;
2105                     }
2106                     if ( charid ) {
2107                         charid--;
2108                         if ( !trie->trans[ state + charid ].next ) {
2109                             trie->trans[ state + charid ].next = next_alloc;
2110                             trie->trans[ state ].check++;
2111                             prev_states[TRIE_NODENUM(next_alloc)]
2112                                     = TRIE_NODENUM(state);
2113                             next_alloc += trie->uniquecharcount;
2114                         }
2115                         state = trie->trans[ state + charid ].next;
2116                     } else {
2117                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2118                     }
2119                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
2120                 }
2121             }
2122             accept_state = TRIE_NODENUM( state );
2123             TRIE_HANDLE_WORD(accept_state);
2124
2125         } /* end second pass */
2126
2127         /* and now dump it out before we compress it */
2128         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2129                                                           revcharmap,
2130                                                           next_alloc, depth+1));
2131
2132         {
2133         /*
2134            * Inplace compress the table.*
2135
2136            For sparse data sets the table constructed by the trie algorithm will
2137            be mostly 0/FAIL transitions or to put it another way mostly empty.
2138            (Note that leaf nodes will not contain any transitions.)
2139
2140            This algorithm compresses the tables by eliminating most such
2141            transitions, at the cost of a modest bit of extra work during lookup:
2142
2143            - Each states[] entry contains a .base field which indicates the
2144            index in the state[] array wheres its transition data is stored.
2145
2146            - If .base is 0 there are no valid transitions from that node.
2147
2148            - If .base is nonzero then charid is added to it to find an entry in
2149            the trans array.
2150
2151            -If trans[states[state].base+charid].check!=state then the
2152            transition is taken to be a 0/Fail transition. Thus if there are fail
2153            transitions at the front of the node then the .base offset will point
2154            somewhere inside the previous nodes data (or maybe even into a node
2155            even earlier), but the .check field determines if the transition is
2156            valid.
2157
2158            XXX - wrong maybe?
2159            The following process inplace converts the table to the compressed
2160            table: We first do not compress the root node 1,and mark all its
2161            .check pointers as 1 and set its .base pointer as 1 as well. This
2162            allows us to do a DFA construction from the compressed table later,
2163            and ensures that any .base pointers we calculate later are greater
2164            than 0.
2165
2166            - We set 'pos' to indicate the first entry of the second node.
2167
2168            - We then iterate over the columns of the node, finding the first and
2169            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2170            and set the .check pointers accordingly, and advance pos
2171            appropriately and repreat for the next node. Note that when we copy
2172            the next pointers we have to convert them from the original
2173            NODEIDX form to NODENUM form as the former is not valid post
2174            compression.
2175
2176            - If a node has no transitions used we mark its base as 0 and do not
2177            advance the pos pointer.
2178
2179            - If a node only has one transition we use a second pointer into the
2180            structure to fill in allocated fail transitions from other states.
2181            This pointer is independent of the main pointer and scans forward
2182            looking for null transitions that are allocated to a state. When it
2183            finds one it writes the single transition into the "hole".  If the
2184            pointer doesnt find one the single transition is appended as normal.
2185
2186            - Once compressed we can Renew/realloc the structures to release the
2187            excess space.
2188
2189            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2190            specifically Fig 3.47 and the associated pseudocode.
2191
2192            demq
2193         */
2194         const U32 laststate = TRIE_NODENUM( next_alloc );
2195         U32 state, charid;
2196         U32 pos = 0, zp=0;
2197         trie->statecount = laststate;
2198
2199         for ( state = 1 ; state < laststate ; state++ ) {
2200             U8 flag = 0;
2201             const U32 stateidx = TRIE_NODEIDX( state );
2202             const U32 o_used = trie->trans[ stateidx ].check;
2203             U32 used = trie->trans[ stateidx ].check;
2204             trie->trans[ stateidx ].check = 0;
2205
2206             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2207                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2208                     if ( trie->trans[ stateidx + charid ].next ) {
2209                         if (o_used == 1) {
2210                             for ( ; zp < pos ; zp++ ) {
2211                                 if ( ! trie->trans[ zp ].next ) {
2212                                     break;
2213                                 }
2214                             }
2215                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2216                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2217                             trie->trans[ zp ].check = state;
2218                             if ( ++zp > pos ) pos = zp;
2219                             break;
2220                         }
2221                         used--;
2222                     }
2223                     if ( !flag ) {
2224                         flag = 1;
2225                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2226                     }
2227                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2228                     trie->trans[ pos ].check = state;
2229                     pos++;
2230                 }
2231             }
2232         }
2233         trie->lasttrans = pos + 1;
2234         trie->states = (reg_trie_state *)
2235             PerlMemShared_realloc( trie->states, laststate
2236                                    * sizeof(reg_trie_state) );
2237         DEBUG_TRIE_COMPILE_MORE_r(
2238                 PerlIO_printf( Perl_debug_log,
2239                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2240                     (int)depth * 2 + 2,"",
2241                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2242                     (IV)next_alloc,
2243                     (IV)pos,
2244                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2245             );
2246
2247         } /* end table compress */
2248     }
2249     DEBUG_TRIE_COMPILE_MORE_r(
2250             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2251                 (int)depth * 2 + 2, "",
2252                 (UV)trie->statecount,
2253                 (UV)trie->lasttrans)
2254     );
2255     /* resize the trans array to remove unused space */
2256     trie->trans = (reg_trie_trans *)
2257         PerlMemShared_realloc( trie->trans, trie->lasttrans
2258                                * sizeof(reg_trie_trans) );
2259
2260     {   /* Modify the program and insert the new TRIE node */
2261         U8 nodetype =(U8)(flags & 0xFF);
2262         char *str=NULL;
2263
2264 #ifdef DEBUGGING
2265         regnode *optimize = NULL;
2266 #ifdef RE_TRACK_PATTERN_OFFSETS
2267
2268         U32 mjd_offset = 0;
2269         U32 mjd_nodelen = 0;
2270 #endif /* RE_TRACK_PATTERN_OFFSETS */
2271 #endif /* DEBUGGING */
2272         /*
2273            This means we convert either the first branch or the first Exact,
2274            depending on whether the thing following (in 'last') is a branch
2275            or not and whther first is the startbranch (ie is it a sub part of
2276            the alternation or is it the whole thing.)
2277            Assuming its a sub part we convert the EXACT otherwise we convert
2278            the whole branch sequence, including the first.
2279          */
2280         /* Find the node we are going to overwrite */
2281         if ( first != startbranch || OP( last ) == BRANCH ) {
2282             /* branch sub-chain */
2283             NEXT_OFF( first ) = (U16)(last - first);
2284 #ifdef RE_TRACK_PATTERN_OFFSETS
2285             DEBUG_r({
2286                 mjd_offset= Node_Offset((convert));
2287                 mjd_nodelen= Node_Length((convert));
2288             });
2289 #endif
2290             /* whole branch chain */
2291         }
2292 #ifdef RE_TRACK_PATTERN_OFFSETS
2293         else {
2294             DEBUG_r({
2295                 const  regnode *nop = NEXTOPER( convert );
2296                 mjd_offset= Node_Offset((nop));
2297                 mjd_nodelen= Node_Length((nop));
2298             });
2299         }
2300         DEBUG_OPTIMISE_r(
2301             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2302                 (int)depth * 2 + 2, "",
2303                 (UV)mjd_offset, (UV)mjd_nodelen)
2304         );
2305 #endif
2306         /* But first we check to see if there is a common prefix we can
2307            split out as an EXACT and put in front of the TRIE node.  */
2308         trie->startstate= 1;
2309         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2310             U32 state;
2311             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2312                 U32 ofs = 0;
2313                 I32 idx = -1;
2314                 U32 count = 0;
2315                 const U32 base = trie->states[ state ].trans.base;
2316
2317                 if ( trie->states[state].wordnum )
2318                         count = 1;
2319
2320                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2321                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2322                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2323                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2324                     {
2325                         if ( ++count > 1 ) {
2326                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2327                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2328                             if ( state == 1 ) break;
2329                             if ( count == 2 ) {
2330                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2331                                 DEBUG_OPTIMISE_r(
2332                                     PerlIO_printf(Perl_debug_log,
2333                                         "%*sNew Start State=%"UVuf" Class: [",
2334                                         (int)depth * 2 + 2, "",
2335                                         (UV)state));
2336                                 if (idx >= 0) {
2337                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2338                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2339
2340                                     TRIE_BITMAP_SET(trie,*ch);
2341                                     if ( folder )
2342                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2343                                     DEBUG_OPTIMISE_r(
2344                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2345                                     );
2346                                 }
2347                             }
2348                             TRIE_BITMAP_SET(trie,*ch);
2349                             if ( folder )
2350                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2351                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2352                         }
2353                         idx = ofs;
2354                     }
2355                 }
2356                 if ( count == 1 ) {
2357                     SV **tmp = av_fetch( revcharmap, idx, 0);
2358                     STRLEN len;
2359                     char *ch = SvPV( *tmp, len );
2360                     DEBUG_OPTIMISE_r({
2361                         SV *sv=sv_newmortal();
2362                         PerlIO_printf( Perl_debug_log,
2363                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2364                             (int)depth * 2 + 2, "",
2365                             (UV)state, (UV)idx,
2366                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2367                                 PL_colors[0], PL_colors[1],
2368                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2369                                 PERL_PV_ESCAPE_FIRSTCHAR
2370                             )
2371                         );
2372                     });
2373                     if ( state==1 ) {
2374                         OP( convert ) = nodetype;
2375                         str=STRING(convert);
2376                         STR_LEN(convert)=0;
2377                     }
2378                     STR_LEN(convert) += len;
2379                     while (len--)
2380                         *str++ = *ch++;
2381                 } else {
2382 #ifdef DEBUGGING
2383                     if (state>1)
2384                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2385 #endif
2386                     break;
2387                 }
2388             }
2389             trie->prefixlen = (state-1);
2390             if (str) {
2391                 regnode *n = convert+NODE_SZ_STR(convert);
2392                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2393                 trie->startstate = state;
2394                 trie->minlen -= (state - 1);
2395                 trie->maxlen -= (state - 1);
2396 #ifdef DEBUGGING
2397                /* At least the UNICOS C compiler choked on this
2398                 * being argument to DEBUG_r(), so let's just have
2399                 * it right here. */
2400                if (
2401 #ifdef PERL_EXT_RE_BUILD
2402                    1
2403 #else
2404                    DEBUG_r_TEST
2405 #endif
2406                    ) {
2407                    regnode *fix = convert;
2408                    U32 word = trie->wordcount;
2409                    mjd_nodelen++;
2410                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2411                    while( ++fix < n ) {
2412                        Set_Node_Offset_Length(fix, 0, 0);
2413                    }
2414                    while (word--) {
2415                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2416                        if (tmp) {
2417                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2418                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2419                            else
2420                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2421                        }
2422                    }
2423                }
2424 #endif
2425                 if (trie->maxlen) {
2426                     convert = n;
2427                 } else {
2428                     NEXT_OFF(convert) = (U16)(tail - convert);
2429                     DEBUG_r(optimize= n);
2430                 }
2431             }
2432         }
2433         if (!jumper)
2434             jumper = last;
2435         if ( trie->maxlen ) {
2436             NEXT_OFF( convert ) = (U16)(tail - convert);
2437             ARG_SET( convert, data_slot );
2438             /* Store the offset to the first unabsorbed branch in
2439                jump[0], which is otherwise unused by the jump logic.
2440                We use this when dumping a trie and during optimisation. */
2441             if (trie->jump)
2442                 trie->jump[0] = (U16)(nextbranch - convert);
2443
2444             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2445              *   and there is a bitmap
2446              *   and the first "jump target" node we found leaves enough room
2447              * then convert the TRIE node into a TRIEC node, with the bitmap
2448              * embedded inline in the opcode - this is hypothetically faster.
2449              */
2450             if ( !trie->states[trie->startstate].wordnum
2451                  && trie->bitmap
2452                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2453             {
2454                 OP( convert ) = TRIEC;
2455                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2456                 PerlMemShared_free(trie->bitmap);
2457                 trie->bitmap= NULL;
2458             } else
2459                 OP( convert ) = TRIE;
2460
2461             /* store the type in the flags */
2462             convert->flags = nodetype;
2463             DEBUG_r({
2464             optimize = convert
2465                       + NODE_STEP_REGNODE
2466                       + regarglen[ OP( convert ) ];
2467             });
2468             /* XXX We really should free up the resource in trie now,
2469                    as we won't use them - (which resources?) dmq */
2470         }
2471         /* needed for dumping*/
2472         DEBUG_r(if (optimize) {
2473             regnode *opt = convert;
2474
2475             while ( ++opt < optimize) {
2476                 Set_Node_Offset_Length(opt,0,0);
2477             }
2478             /*
2479                 Try to clean up some of the debris left after the
2480                 optimisation.
2481              */
2482             while( optimize < jumper ) {
2483                 mjd_nodelen += Node_Length((optimize));
2484                 OP( optimize ) = OPTIMIZED;
2485                 Set_Node_Offset_Length(optimize,0,0);
2486                 optimize++;
2487             }
2488             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2489         });
2490     } /* end node insert */
2491
2492     /*  Finish populating the prev field of the wordinfo array.  Walk back
2493      *  from each accept state until we find another accept state, and if
2494      *  so, point the first word's .prev field at the second word. If the
2495      *  second already has a .prev field set, stop now. This will be the
2496      *  case either if we've already processed that word's accept state,
2497      *  or that state had multiple words, and the overspill words were
2498      *  already linked up earlier.
2499      */
2500     {
2501         U16 word;
2502         U32 state;
2503         U16 prev;
2504
2505         for (word=1; word <= trie->wordcount; word++) {
2506             prev = 0;
2507             if (trie->wordinfo[word].prev)
2508                 continue;
2509             state = trie->wordinfo[word].accept;
2510             while (state) {
2511                 state = prev_states[state];
2512                 if (!state)
2513                     break;
2514                 prev = trie->states[state].wordnum;
2515                 if (prev)
2516                     break;
2517             }
2518             trie->wordinfo[word].prev = prev;
2519         }
2520         Safefree(prev_states);
2521     }
2522
2523
2524     /* and now dump out the compressed format */
2525     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2526
2527     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2528 #ifdef DEBUGGING
2529     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2530     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2531 #else
2532     SvREFCNT_dec_NN(revcharmap);
2533 #endif
2534     return trie->jump
2535            ? MADE_JUMP_TRIE
2536            : trie->startstate>1
2537              ? MADE_EXACT_TRIE
2538              : MADE_TRIE;
2539 }
2540
2541 STATIC void
2542 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2543 {
2544 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2545
2546    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2547    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2548    ISBN 0-201-10088-6
2549
2550    We find the fail state for each state in the trie, this state is the longest proper
2551    suffix of the current state's 'word' that is also a proper prefix of another word in our
2552    trie. State 1 represents the word '' and is thus the default fail state. This allows
2553    the DFA not to have to restart after its tried and failed a word at a given point, it
2554    simply continues as though it had been matching the other word in the first place.
2555    Consider
2556       'abcdgu'=~/abcdefg|cdgu/
2557    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2558    fail, which would bring us to the state representing 'd' in the second word where we would
2559    try 'g' and succeed, proceeding to match 'cdgu'.
2560  */
2561  /* add a fail transition */
2562     const U32 trie_offset = ARG(source);
2563     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2564     U32 *q;
2565     const U32 ucharcount = trie->uniquecharcount;
2566     const U32 numstates = trie->statecount;
2567     const U32 ubound = trie->lasttrans + ucharcount;
2568     U32 q_read = 0;
2569     U32 q_write = 0;
2570     U32 charid;
2571     U32 base = trie->states[ 1 ].trans.base;
2572     U32 *fail;
2573     reg_ac_data *aho;
2574     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2575     GET_RE_DEBUG_FLAGS_DECL;
2576
2577     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2578 #ifndef DEBUGGING
2579     PERL_UNUSED_ARG(depth);
2580 #endif
2581
2582
2583     ARG_SET( stclass, data_slot );
2584     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2585     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2586     aho->trie=trie_offset;
2587     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2588     Copy( trie->states, aho->states, numstates, reg_trie_state );
2589     Newxz( q, numstates, U32);
2590     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2591     aho->refcount = 1;
2592     fail = aho->fail;
2593     /* initialize fail[0..1] to be 1 so that we always have
2594        a valid final fail state */
2595     fail[ 0 ] = fail[ 1 ] = 1;
2596
2597     for ( charid = 0; charid < ucharcount ; charid++ ) {
2598         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2599         if ( newstate ) {
2600             q[ q_write ] = newstate;
2601             /* set to point at the root */
2602             fail[ q[ q_write++ ] ]=1;
2603         }
2604     }
2605     while ( q_read < q_write) {
2606         const U32 cur = q[ q_read++ % numstates ];
2607         base = trie->states[ cur ].trans.base;
2608
2609         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2610             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2611             if (ch_state) {
2612                 U32 fail_state = cur;
2613                 U32 fail_base;
2614                 do {
2615                     fail_state = fail[ fail_state ];
2616                     fail_base = aho->states[ fail_state ].trans.base;
2617                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2618
2619                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2620                 fail[ ch_state ] = fail_state;
2621                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2622                 {
2623                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2624                 }
2625                 q[ q_write++ % numstates] = ch_state;
2626             }
2627         }
2628     }
2629     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2630        when we fail in state 1, this allows us to use the
2631        charclass scan to find a valid start char. This is based on the principle
2632        that theres a good chance the string being searched contains lots of stuff
2633        that cant be a start char.
2634      */
2635     fail[ 0 ] = fail[ 1 ] = 0;
2636     DEBUG_TRIE_COMPILE_r({
2637         PerlIO_printf(Perl_debug_log,
2638                       "%*sStclass Failtable (%"UVuf" states): 0",
2639                       (int)(depth * 2), "", (UV)numstates
2640         );
2641         for( q_read=1; q_read<numstates; q_read++ ) {
2642             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2643         }
2644         PerlIO_printf(Perl_debug_log, "\n");
2645     });
2646     Safefree(q);
2647     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2648 }
2649
2650
2651 /*
2652  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2653  * These need to be revisited when a newer toolchain becomes available.
2654  */
2655 #if defined(__sparc64__) && defined(__GNUC__)
2656 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2657 #       undef  SPARC64_GCC_WORKAROUND
2658 #       define SPARC64_GCC_WORKAROUND 1
2659 #   endif
2660 #endif
2661
2662 #define DEBUG_PEEP(str,scan,depth) \
2663     DEBUG_OPTIMISE_r({if (scan){ \
2664        SV * const mysv=sv_newmortal(); \
2665        regnode *Next = regnext(scan); \
2666        regprop(RExC_rx, mysv, scan); \
2667        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2668        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2669        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2670    }});
2671
2672
2673 /* The below joins as many adjacent EXACTish nodes as possible into a single
2674  * one.  The regop may be changed if the node(s) contain certain sequences that
2675  * require special handling.  The joining is only done if:
2676  * 1) there is room in the current conglomerated node to entirely contain the
2677  *    next one.
2678  * 2) they are the exact same node type
2679  *
2680  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2681  * these get optimized out
2682  *
2683  * If a node is to match under /i (folded), the number of characters it matches
2684  * can be different than its character length if it contains a multi-character
2685  * fold.  *min_subtract is set to the total delta of the input nodes.
2686  *
2687  * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2688  * and contains LATIN SMALL LETTER SHARP S
2689  *
2690  * This is as good a place as any to discuss the design of handling these
2691  * multi-character fold sequences.  It's been wrong in Perl for a very long
2692  * time.  There are three code points in Unicode whose multi-character folds
2693  * were long ago discovered to mess things up.  The previous designs for
2694  * dealing with these involved assigning a special node for them.  This
2695  * approach doesn't work, as evidenced by this example:
2696  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
2697  * Both these fold to "sss", but if the pattern is parsed to create a node that
2698  * would match just the \xDF, it won't be able to handle the case where a
2699  * successful match would have to cross the node's boundary.  The new approach
2700  * that hopefully generally solves the problem generates an EXACTFU_SS node
2701  * that is "sss".
2702  *
2703  * It turns out that there are problems with all multi-character folds, and not
2704  * just these three.  Now the code is general, for all such cases.  The
2705  * approach taken is:
2706  * 1)   This routine examines each EXACTFish node that could contain multi-
2707  *      character fold sequences.  It returns in *min_subtract how much to
2708  *      subtract from the the actual length of the string to get a real minimum
2709  *      match length; it is 0 if there are no multi-char folds.  This delta is
2710  *      used by the caller to adjust the min length of the match, and the delta
2711  *      between min and max, so that the optimizer doesn't reject these
2712  *      possibilities based on size constraints.
2713  * 2)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
2714  *      is used for an EXACTFU node that contains at least one "ss" sequence in
2715  *      it.  For non-UTF-8 patterns and strings, this is the only case where
2716  *      there is a possible fold length change.  That means that a regular
2717  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
2718  *      with length changes, and so can be processed faster.  regexec.c takes
2719  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
2720  *      pre-folded by regcomp.c.  This saves effort in regex matching.
2721  *      However, the pre-folding isn't done for non-UTF8 patterns because the
2722  *      fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
2723  *      down by forcing the pattern into UTF8 unless necessary.  Also what
2724  *      EXACTF and EXACTFL nodes fold to isn't known until runtime.  The fold
2725  *      possibilities for the non-UTF8 patterns are quite simple, except for
2726  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
2727  *      members of a fold-pair, and arrays are set up for all of them so that
2728  *      the other member of the pair can be found quickly.  Code elsewhere in
2729  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
2730  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
2731  *      described in the next item.
2732  * 3)   A problem remains for the sharp s in EXACTF and EXACTFA nodes when the
2733  *      pattern isn't in UTF-8. (BTW, there cannot be an EXACTF node with a
2734  *      UTF-8 pattern.)  An assumption that the optimizer part of regexec.c
2735  *      (probably unwittingly, in Perl_regexec_flags()) makes is that a
2736  *      character in the pattern corresponds to at most a single character in
2737  *      the target string.  (And I do mean character, and not byte here, unlike
2738  *      other parts of the documentation that have never been updated to
2739  *      account for multibyte Unicode.)  sharp s in EXACTF nodes can match the
2740  *      two character string 'ss'; in EXACTFA nodes it can match
2741  *      "\x{17F}\x{17F}".  These violate the assumption, and they are the only
2742  *      instances where it is violated.  I'm reluctant to try to change the
2743  *      assumption, as the code involved is impenetrable to me (khw), so
2744  *      instead the code here punts.  This routine examines (when the pattern
2745  *      isn't UTF-8) EXACTF and EXACTFA nodes for the sharp s, and returns a
2746  *      boolean indicating whether or not the node contains a sharp s.  When it
2747  *      is true, the caller sets a flag that later causes the optimizer in this
2748  *      file to not set values for the floating and fixed string lengths, and
2749  *      thus avoids the optimizer code in regexec.c that makes the invalid
2750  *      assumption.  Thus, there is no optimization based on string lengths for
2751  *      non-UTF8-pattern EXACTF and EXACTFA nodes that contain the sharp s.
2752  *      (The reason the assumption is wrong only in these two cases is that all
2753  *      other non-UTF-8 folds are 1-1; and, for UTF-8 patterns, we pre-fold all
2754  *      other folds to their expanded versions.  We can't prefold sharp s to
2755  *      'ss' in EXACTF nodes because we don't know at compile time if it
2756  *      actually matches 'ss' or not.  It will match iff the target string is
2757  *      in UTF-8, unlike the EXACTFU nodes, where it always matches; and
2758  *      EXACTFA and EXACTFL where it never does.  In an EXACTFA node in a UTF-8
2759  *      pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the problem;
2760  *      but in a non-UTF8 pattern, folding it to that above-Latin1 string would
2761  *      require the pattern to be forced into UTF-8, the overhead of which we
2762  *      want to avoid.)
2763  *
2764  *      Similarly, the code that generates tries doesn't currently handle
2765  *      not-already-folded multi-char folds, and it looks like a pain to change
2766  *      that.  Therefore, trie generation of EXACTFA nodes with the sharp s
2767  *      doesn't work.  Instead, such an EXACTFA is turned into a new regnode,
2768  *      EXACTFA_NO_TRIE, which the trie code knows not to handle.  Most people
2769  *      using /iaa matching will be doing so almost entirely with ASCII
2770  *      strings, so this should rarely be encountered in practice */
2771
2772 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2773     if (PL_regkind[OP(scan)] == EXACT) \
2774         join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2775
2776 STATIC U32
2777 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2778     /* Merge several consecutive EXACTish nodes into one. */
2779     regnode *n = regnext(scan);
2780     U32 stringok = 1;
2781     regnode *next = scan + NODE_SZ_STR(scan);
2782     U32 merged = 0;
2783     U32 stopnow = 0;
2784 #ifdef DEBUGGING
2785     regnode *stop = scan;
2786     GET_RE_DEBUG_FLAGS_DECL;
2787 #else
2788     PERL_UNUSED_ARG(depth);
2789 #endif
2790
2791     PERL_ARGS_ASSERT_JOIN_EXACT;
2792 #ifndef EXPERIMENTAL_INPLACESCAN
2793     PERL_UNUSED_ARG(flags);
2794     PERL_UNUSED_ARG(val);
2795 #endif
2796     DEBUG_PEEP("join",scan,depth);
2797
2798     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
2799      * EXACT ones that are mergeable to the current one. */
2800     while (n
2801            && (PL_regkind[OP(n)] == NOTHING
2802                || (stringok && OP(n) == OP(scan)))
2803            && NEXT_OFF(n)
2804            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2805     {
2806
2807         if (OP(n) == TAIL || n > next)
2808             stringok = 0;
2809         if (PL_regkind[OP(n)] == NOTHING) {
2810             DEBUG_PEEP("skip:",n,depth);
2811             NEXT_OFF(scan) += NEXT_OFF(n);
2812             next = n + NODE_STEP_REGNODE;
2813 #ifdef DEBUGGING
2814             if (stringok)
2815                 stop = n;
2816 #endif
2817             n = regnext(n);
2818         }
2819         else if (stringok) {
2820             const unsigned int oldl = STR_LEN(scan);
2821             regnode * const nnext = regnext(n);
2822
2823             /* XXX I (khw) kind of doubt that this works on platforms where
2824              * U8_MAX is above 255 because of lots of other assumptions */
2825             /* Don't join if the sum can't fit into a single node */
2826             if (oldl + STR_LEN(n) > U8_MAX)
2827                 break;
2828
2829             DEBUG_PEEP("merg",n,depth);
2830             merged++;
2831
2832             NEXT_OFF(scan) += NEXT_OFF(n);
2833             STR_LEN(scan) += STR_LEN(n);
2834             next = n + NODE_SZ_STR(n);
2835             /* Now we can overwrite *n : */
2836             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2837 #ifdef DEBUGGING
2838             stop = next - 1;
2839 #endif
2840             n = nnext;
2841             if (stopnow) break;
2842         }
2843
2844 #ifdef EXPERIMENTAL_INPLACESCAN
2845         if (flags && !NEXT_OFF(n)) {
2846             DEBUG_PEEP("atch", val, depth);
2847             if (reg_off_by_arg[OP(n)]) {
2848                 ARG_SET(n, val - n);
2849             }
2850             else {
2851                 NEXT_OFF(n) = val - n;
2852             }
2853             stopnow = 1;
2854         }
2855 #endif
2856     }
2857
2858     *min_subtract = 0;
2859     *has_exactf_sharp_s = FALSE;
2860
2861     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
2862      * can now analyze for sequences of problematic code points.  (Prior to
2863      * this final joining, sequences could have been split over boundaries, and
2864      * hence missed).  The sequences only happen in folding, hence for any
2865      * non-EXACT EXACTish node */
2866     if (OP(scan) != EXACT) {
2867         const U8 * const s0 = (U8*) STRING(scan);
2868         const U8 * s = s0;
2869         const U8 * const s_end = s0 + STR_LEN(scan);
2870
2871         /* One pass is made over the node's string looking for all the
2872          * possibilities.  to avoid some tests in the loop, there are two main
2873          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2874          * non-UTF-8 */
2875         if (UTF) {
2876
2877             /* Examine the string for a multi-character fold sequence.  UTF-8
2878              * patterns have all characters pre-folded by the time this code is
2879              * executed */
2880             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
2881                                      length sequence we are looking for is 2 */
2882             {
2883                 int count = 0;
2884                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
2885                 if (! len) {    /* Not a multi-char fold: get next char */
2886                     s += UTF8SKIP(s);
2887                     continue;
2888                 }
2889
2890                 /* Nodes with 'ss' require special handling, except for EXACTFL
2891                  * and EXACTFA-ish for which there is no multi-char fold to
2892                  * this */
2893                 if (len == 2 && *s == 's' && *(s+1) == 's'
2894                     && OP(scan) != EXACTFL
2895                     && OP(scan) != EXACTFA
2896                     && OP(scan) != EXACTFA_NO_TRIE)
2897                 {
2898                     count = 2;
2899                     OP(scan) = EXACTFU_SS;
2900                     s += 2;
2901                 }
2902                 else { /* Here is a generic multi-char fold. */
2903                     const U8* multi_end  = s + len;
2904
2905                     /* Count how many characters in it.  In the case of /l and
2906                      * /aa, no folds which contain ASCII code points are
2907                      * allowed, so check for those, and skip if found.  (In
2908                      * EXACTFL, no folds are allowed to any Latin1 code point,
2909                      * not just ASCII.  But there aren't any of these
2910                      * currently, nor ever likely, so don't take the time to
2911                      * test for them.  The code that generates the
2912                      * is_MULTI_foo() macros croaks should one actually get put
2913                      * into Unicode .) */
2914                     if (OP(scan) != EXACTFL
2915                         && OP(scan) != EXACTFA
2916                         && OP(scan) != EXACTFA_NO_TRIE)
2917                     {
2918                         count = utf8_length(s, multi_end);
2919                         s = multi_end;
2920                     }
2921                     else {
2922                         while (s < multi_end) {
2923                             if (isASCII(*s)) {
2924                                 s++;
2925                                 goto next_iteration;
2926                             }
2927                             else {
2928                                 s += UTF8SKIP(s);
2929                             }
2930                             count++;
2931                         }
2932                     }
2933                 }
2934
2935                 /* The delta is how long the sequence is minus 1 (1 is how long
2936                  * the character that folds to the sequence is) */
2937                 *min_subtract += count - 1;
2938             next_iteration: ;
2939             }
2940         }
2941         else if (OP(scan) == EXACTFA) {
2942
2943             /* Non-UTF-8 pattern, EXACTFA node.  There can't be a multi-char
2944              * fold to the ASCII range (and there are no existing ones in the
2945              * upper latin1 range).  But, as outlined in the comments preceding
2946              * this function, we need to flag any occurrences of the sharp s.
2947              * This character forbids trie formation (because of added
2948              * complexity) */
2949             while (s < s_end) {
2950                 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
2951                     OP(scan) = EXACTFA_NO_TRIE;
2952                     *has_exactf_sharp_s = TRUE;
2953                     break;
2954                 }
2955                 s++;
2956                 continue;
2957             }
2958         }
2959         else if (OP(scan) != EXACTFL) {
2960
2961             /* Non-UTF-8 pattern, not EXACTFA nor EXACTFL node.  Look for the
2962              * multi-char folds that are all Latin1.  (This code knows that
2963              * there are no current multi-char folds possible with EXACTFL,
2964              * relying on fold_grind.t to catch any errors if the very unlikely
2965              * event happens that some get added in future Unicode versions.)
2966              * As explained in the comments preceding this function, we look
2967              * also for the sharp s in EXACTF nodes; it can be in the final
2968              * position.  Otherwise we can stop looking 1 byte earlier because
2969              * have to find at least two characters for a multi-fold */
2970             const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2971
2972             while (s < upper) {
2973                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
2974                 if (! len) {    /* Not a multi-char fold. */
2975                     if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
2976                     {
2977                         *has_exactf_sharp_s = TRUE;
2978                     }
2979                     s++;
2980                     continue;
2981                 }
2982
2983                 if (len == 2
2984                     && isARG2_lower_or_UPPER_ARG1('s', *s)
2985                     && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
2986                 {
2987
2988                     /* EXACTF nodes need to know that the minimum length
2989                      * changed so that a sharp s in the string can match this
2990                      * ss in the pattern, but they remain EXACTF nodes, as they
2991                      * won't match this unless the target string is is UTF-8,
2992                      * which we don't know until runtime */
2993                     if (OP(scan) != EXACTF) {
2994                         OP(scan) = EXACTFU_SS;
2995                     }
2996                 }
2997
2998                 *min_subtract += len - 1;
2999                 s += len;
3000             }
3001         }
3002     }
3003
3004 #ifdef DEBUGGING
3005     /* Allow dumping but overwriting the collection of skipped
3006      * ops and/or strings with fake optimized ops */
3007     n = scan + NODE_SZ_STR(scan);
3008     while (n <= stop) {
3009         OP(n) = OPTIMIZED;
3010         FLAGS(n) = 0;
3011         NEXT_OFF(n) = 0;
3012         n++;
3013     }
3014 #endif
3015     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3016     return stopnow;
3017 }
3018
3019 /* REx optimizer.  Converts nodes into quicker variants "in place".
3020    Finds fixed substrings.  */
3021
3022 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3023    to the position after last scanned or to NULL. */
3024
3025 #define INIT_AND_WITHP \
3026     assert(!and_withp); \
3027     Newx(and_withp,1,struct regnode_charclass_class); \
3028     SAVEFREEPV(and_withp)
3029
3030 /* this is a chain of data about sub patterns we are processing that
3031    need to be handled separately/specially in study_chunk. Its so
3032    we can simulate recursion without losing state.  */
3033 struct scan_frame;
3034 typedef struct scan_frame {
3035     regnode *last;  /* last node to process in this frame */
3036     regnode *next;  /* next node to process when last is reached */
3037     struct scan_frame *prev; /*previous frame*/
3038     I32 stop; /* what stopparen do we use */
3039 } scan_frame;
3040
3041
3042 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
3043
3044 STATIC SSize_t
3045 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3046                         SSize_t *minlenp, SSize_t *deltap,
3047                         regnode *last,
3048                         scan_data_t *data,
3049                         I32 stopparen,
3050                         U8* recursed,
3051                         struct regnode_charclass_class *and_withp,
3052                         U32 flags, U32 depth)
3053                         /* scanp: Start here (read-write). */
3054                         /* deltap: Write maxlen-minlen here. */
3055                         /* last: Stop before this one. */
3056                         /* data: string data about the pattern */
3057                         /* stopparen: treat close N as END */
3058                         /* recursed: which subroutines have we recursed into */
3059                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3060 {
3061     dVAR;
3062     /* There must be at least this number of characters to match */
3063     SSize_t min = 0;
3064     I32 pars = 0, code;
3065     regnode *scan = *scanp, *next;
3066     SSize_t delta = 0;
3067     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3068     int is_inf_internal = 0;            /* The studied chunk is infinite */
3069     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3070     scan_data_t data_fake;
3071     SV *re_trie_maxbuff = NULL;
3072     regnode *first_non_open = scan;
3073     SSize_t stopmin = SSize_t_MAX;
3074     scan_frame *frame = NULL;
3075     GET_RE_DEBUG_FLAGS_DECL;
3076
3077     PERL_ARGS_ASSERT_STUDY_CHUNK;
3078
3079 #ifdef DEBUGGING
3080     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3081 #endif
3082
3083     if ( depth == 0 ) {
3084         while (first_non_open && OP(first_non_open) == OPEN)
3085             first_non_open=regnext(first_non_open);
3086     }
3087
3088
3089   fake_study_recurse:
3090     while ( scan && OP(scan) != END && scan < last ){
3091         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3092                                    node length to get a real minimum (because
3093                                    the folded version may be shorter) */
3094         bool has_exactf_sharp_s = FALSE;
3095         /* Peephole optimizer: */
3096         DEBUG_STUDYDATA("Peep:", data,depth);
3097         DEBUG_PEEP("Peep",scan,depth);
3098
3099         /* Its not clear to khw or hv why this is done here, and not in the
3100          * clauses that deal with EXACT nodes.  khw's guess is that it's
3101          * because of a previous design */
3102         JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3103
3104         /* Follow the next-chain of the current node and optimize
3105            away all the NOTHINGs from it.  */
3106         if (OP(scan) != CURLYX) {
3107             const int max = (reg_off_by_arg[OP(scan)]
3108                        ? I32_MAX
3109                        /* I32 may be smaller than U16 on CRAYs! */
3110                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3111             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3112             int noff;
3113             regnode *n = scan;
3114
3115             /* Skip NOTHING and LONGJMP. */
3116             while ((n = regnext(n))
3117                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3118                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3119                    && off + noff < max)
3120                 off += noff;
3121             if (reg_off_by_arg[OP(scan)])
3122                 ARG(scan) = off;
3123             else
3124                 NEXT_OFF(scan) = off;
3125         }
3126
3127
3128
3129         /* The principal pseudo-switch.  Cannot be a switch, since we
3130            look into several different things.  */
3131         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3132                    || OP(scan) == IFTHEN) {
3133             next = regnext(scan);
3134             code = OP(scan);
3135             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3136
3137             if (OP(next) == code || code == IFTHEN) {
3138                 /* NOTE - There is similar code to this block below for handling
3139                    TRIE nodes on a re-study.  If you change stuff here check there
3140                    too. */
3141                 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3142                 struct regnode_charclass_class accum;
3143                 regnode * const startbranch=scan;
3144
3145                 if (flags & SCF_DO_SUBSTR)
3146                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3147                 if (flags & SCF_DO_STCLASS)
3148                     cl_init_zero(pRExC_state, &accum);
3149
3150                 while (OP(scan) == code) {
3151                     SSize_t deltanext, minnext, fake;
3152                     I32 f = 0;
3153                     struct regnode_charclass_class this_class;
3154
3155                     num++;
3156                     data_fake.flags = 0;
3157                     if (data) {
3158                         data_fake.whilem_c = data->whilem_c;
3159                         data_fake.last_closep = data->last_closep;
3160                     }
3161                     else
3162                         data_fake.last_closep = &fake;
3163
3164                     data_fake.pos_delta = delta;
3165                     next = regnext(scan);
3166                     scan = NEXTOPER(scan);
3167                     if (code != BRANCH)
3168                         scan = NEXTOPER(scan);
3169                     if (flags & SCF_DO_STCLASS) {
3170                         cl_init(pRExC_state, &this_class);
3171                         data_fake.start_class = &this_class;
3172                         f = SCF_DO_STCLASS_AND;
3173                     }
3174                     if (flags & SCF_WHILEM_VISITED_POS)
3175                         f |= SCF_WHILEM_VISITED_POS;
3176
3177                     /* we suppose the run is continuous, last=next...*/
3178                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3179                                           next, &data_fake,
3180                                           stopparen, recursed, NULL, f,depth+1);
3181                     if (min1 > minnext)
3182                         min1 = minnext;
3183                     if (deltanext == SSize_t_MAX) {
3184                         is_inf = is_inf_internal = 1;
3185                         max1 = SSize_t_MAX;
3186                     } else if (max1 < minnext + deltanext)
3187                         max1 = minnext + deltanext;
3188                     scan = next;
3189                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3190                         pars++;
3191                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3192                         if ( stopmin > minnext)
3193                             stopmin = min + min1;
3194                         flags &= ~SCF_DO_SUBSTR;
3195                         if (data)
3196                             data->flags |= SCF_SEEN_ACCEPT;
3197                     }
3198                     if (data) {
3199                         if (data_fake.flags & SF_HAS_EVAL)
3200                             data->flags |= SF_HAS_EVAL;
3201                         data->whilem_c = data_fake.whilem_c;
3202                     }
3203                     if (flags & SCF_DO_STCLASS)
3204                         cl_or(pRExC_state, &accum, &this_class);
3205                 }
3206                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3207                     min1 = 0;
3208                 if (flags & SCF_DO_SUBSTR) {
3209                     data->pos_min += min1;
3210                     if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3211                         data->pos_delta = SSize_t_MAX;
3212                     else
3213                         data->pos_delta += max1 - min1;
3214                     if (max1 != min1 || is_inf)
3215                         data->longest = &(data->longest_float);
3216                 }
3217                 min += min1;
3218                 if (delta == SSize_t_MAX
3219                  || SSize_t_MAX - delta - (max1 - min1) < 0)
3220                     delta = SSize_t_MAX;
3221                 else
3222                     delta += max1 - min1;
3223                 if (flags & SCF_DO_STCLASS_OR) {
3224                     cl_or(pRExC_state, data->start_class, &accum);
3225                     if (min1) {
3226                         cl_and(data->start_class, and_withp);
3227                         flags &= ~SCF_DO_STCLASS;
3228                     }
3229                 }
3230                 else if (flags & SCF_DO_STCLASS_AND) {
3231                     if (min1) {
3232                         cl_and(data->start_class, &accum);
3233                         flags &= ~SCF_DO_STCLASS;
3234                     }
3235                     else {
3236                         /* Switch to OR mode: cache the old value of
3237                          * data->start_class */
3238                         INIT_AND_WITHP;
3239                         StructCopy(data->start_class, and_withp,
3240                                    struct regnode_charclass_class);
3241                         flags &= ~SCF_DO_STCLASS_AND;
3242                         StructCopy(&accum, data->start_class,
3243                                    struct regnode_charclass_class);
3244                         flags |= SCF_DO_STCLASS_OR;
3245                         SET_SSC_EOS(data->start_class);
3246                     }
3247                 }
3248
3249                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3250                 /* demq.
3251
3252                    Assuming this was/is a branch we are dealing with: 'scan' now
3253                    points at the item that follows the branch sequence, whatever
3254                    it is. We now start at the beginning of the sequence and look
3255                    for subsequences of
3256
3257                    BRANCH->EXACT=>x1
3258                    BRANCH->EXACT=>x2
3259                    tail
3260
3261                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
3262
3263                    If we can find such a subsequence we need to turn the first
3264                    element into a trie and then add the subsequent branch exact
3265                    strings to the trie.
3266
3267                    We have two cases
3268
3269                      1. patterns where the whole set of branches can be converted.
3270
3271                      2. patterns where only a subset can be converted.
3272
3273                    In case 1 we can replace the whole set with a single regop
3274                    for the trie. In case 2 we need to keep the start and end
3275                    branches so
3276
3277                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3278                      becomes BRANCH TRIE; BRANCH X;
3279
3280                   There is an additional case, that being where there is a
3281                   common prefix, which gets split out into an EXACT like node
3282                   preceding the TRIE node.
3283
3284                   If x(1..n)==tail then we can do a simple trie, if not we make
3285                   a "jump" trie, such that when we match the appropriate word
3286                   we "jump" to the appropriate tail node. Essentially we turn
3287                   a nested if into a case structure of sorts.
3288
3289                 */
3290
3291                     int made=0;
3292                     if (!re_trie_maxbuff) {
3293                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3294                         if (!SvIOK(re_trie_maxbuff))
3295                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3296                     }
3297                     if ( SvIV(re_trie_maxbuff)>=0  ) {
3298                         regnode *cur;
3299                         regnode *first = (regnode *)NULL;
3300                         regnode *last = (regnode *)NULL;
3301                         regnode *tail = scan;
3302                         U8 trietype = 0;
3303                         U32 count=0;
3304
3305 #ifdef DEBUGGING
3306                         SV * const mysv = sv_newmortal();       /* for dumping */
3307 #endif
3308                         /* var tail is used because there may be a TAIL
3309                            regop in the way. Ie, the exacts will point to the
3310                            thing following the TAIL, but the last branch will
3311                            point at the TAIL. So we advance tail. If we
3312                            have nested (?:) we may have to move through several
3313                            tails.
3314                          */
3315
3316                         while ( OP( tail ) == TAIL ) {
3317                             /* this is the TAIL generated by (?:) */
3318                             tail = regnext( tail );
3319                         }
3320
3321
3322                         DEBUG_TRIE_COMPILE_r({
3323                             regprop(RExC_rx, mysv, tail );
3324                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3325                                 (int)depth * 2 + 2, "",
3326                                 "Looking for TRIE'able sequences. Tail node is: ",
3327                                 SvPV_nolen_const( mysv )
3328                             );
3329                         });
3330
3331                         /*
3332
3333                             Step through the branches
3334                                 cur represents each branch,
3335                                 noper is the first thing to be matched as part of that branch
3336                                 noper_next is the regnext() of that node.
3337
3338                             We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3339                             via a "jump trie" but we also support building with NOJUMPTRIE,
3340                             which restricts the trie logic to structures like /FOO|BAR/.
3341
3342                             If noper is a trieable nodetype then the branch is a possible optimization
3343                             target. If we are building under NOJUMPTRIE then we require that noper_next
3344                             is the same as scan (our current position in the regex program).
3345
3346                             Once we have two or more consecutive such branches we can create a
3347                             trie of the EXACT's contents and stitch it in place into the program.
3348
3349                             If the sequence represents all of the branches in the alternation we
3350                             replace the entire thing with a single TRIE node.
3351
3352                             Otherwise when it is a subsequence we need to stitch it in place and
3353                             replace only the relevant branches. This means the first branch has
3354                             to remain as it is used by the alternation logic, and its next pointer,
3355                             and needs to be repointed at the item on the branch chain following
3356                             the last branch we have optimized away.
3357
3358                             This could be either a BRANCH, in which case the subsequence is internal,
3359                             or it could be the item following the branch sequence in which case the
3360                             subsequence is at the end (which does not necessarily mean the first node
3361                             is the start of the alternation).
3362
3363                             TRIE_TYPE(X) is a define which maps the optype to a trietype.
3364
3365                                 optype          |  trietype
3366                                 ----------------+-----------
3367                                 NOTHING         | NOTHING
3368                                 EXACT           | EXACT
3369                                 EXACTFU         | EXACTFU
3370                                 EXACTFU_SS      | EXACTFU
3371                                 EXACTFA         | EXACTFA
3372
3373
3374                         */
3375 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3376                        ( EXACT == (X) )   ? EXACT :        \
3377                        ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU :        \
3378                        ( EXACTFA == (X) ) ? EXACTFA :        \
3379                        0 )
3380
3381                         /* dont use tail as the end marker for this traverse */
3382                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3383                             regnode * const noper = NEXTOPER( cur );
3384                             U8 noper_type = OP( noper );
3385                             U8 noper_trietype = TRIE_TYPE( noper_type );
3386 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3387                             regnode * const noper_next = regnext( noper );
3388                             U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3389                             U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3390 #endif
3391
3392                             DEBUG_TRIE_COMPILE_r({
3393                                 regprop(RExC_rx, mysv, cur);
3394                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3395                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3396
3397                                 regprop(RExC_rx, mysv, noper);
3398                                 PerlIO_printf( Perl_debug_log, " -> %s",
3399                                     SvPV_nolen_const(mysv));
3400
3401                                 if ( noper_next ) {
3402                                   regprop(RExC_rx, mysv, noper_next );
3403                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3404                                     SvPV_nolen_const(mysv));
3405                                 }
3406                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3407                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3408                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3409                                 );
3410                             });
3411
3412                             /* Is noper a trieable nodetype that can be merged with the
3413                              * current trie (if there is one)? */
3414                             if ( noper_trietype
3415                                   &&
3416                                   (
3417                                         ( noper_trietype == NOTHING)
3418                                         || ( trietype == NOTHING )
3419                                         || ( trietype == noper_trietype )
3420                                   )
3421 #ifdef NOJUMPTRIE
3422                                   && noper_next == tail
3423 #endif
3424                                   && count < U16_MAX)
3425                             {
3426                                 /* Handle mergable triable node
3427                                  * Either we are the first node in a new trieable sequence,
3428                                  * in which case we do some bookkeeping, otherwise we update
3429                                  * the end pointer. */
3430                                 if ( !first ) {
3431                                     first = cur;
3432                                     if ( noper_trietype == NOTHING ) {
3433 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3434                                         regnode * const noper_next = regnext( noper );
3435                                         U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3436                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3437 #endif
3438
3439                                         if ( noper_next_trietype ) {
3440                                             trietype = noper_next_trietype;
3441                                         } else if (noper_next_type)  {
3442                                             /* a NOTHING regop is 1 regop wide. We need at least two
3443                                              * for a trie so we can't merge this in */
3444                                             first = NULL;
3445                                         }
3446                                     } else {
3447                                         trietype = noper_trietype;
3448                                     }
3449                                 } else {
3450                                     if ( trietype == NOTHING )
3451                                         trietype = noper_trietype;
3452                                     last = cur;
3453                                 }
3454                                 if (first)
3455                                     count++;
3456                             } /* end handle mergable triable node */
3457                             else {
3458                                 /* handle unmergable node -
3459                                  * noper may either be a triable node which can not be tried
3460                                  * together with the current trie, or a non triable node */
3461                                 if ( last ) {
3462                                     /* If last is set and trietype is not NOTHING then we have found
3463                                      * at least two triable branch sequences in a row of a similar
3464                                      * trietype so we can turn them into a trie. If/when we
3465                                      * allow NOTHING to start a trie sequence this condition will be
3466                                      * required, and it isn't expensive so we leave it in for now. */
3467                                     if ( trietype && trietype != NOTHING )
3468                                         make_trie( pRExC_state,
3469                                                 startbranch, first, cur, tail, count,
3470                                                 trietype, depth+1 );
3471                                     last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3472                                 }
3473                                 if ( noper_trietype
3474 #ifdef NOJUMPTRIE
3475                                      && noper_next == tail
3476 #endif
3477                                 ){
3478                                     /* noper is triable, so we can start a new trie sequence */
3479                                     count = 1;
3480                                     first = cur;
3481                                     trietype = noper_trietype;
3482                                 } else if (first) {
3483                                     /* if we already saw a first but the current node is not triable then we have
3484                                      * to reset the first information. */
3485                                     count = 0;
3486                                     first = NULL;
3487                                     trietype = 0;
3488                                 }
3489                             } /* end handle unmergable node */
3490                         } /* loop over branches */
3491                         DEBUG_TRIE_COMPILE_r({
3492                             regprop(RExC_rx, mysv, cur);
3493                             PerlIO_printf( Perl_debug_log,
3494                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3495                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3496
3497                         });
3498                         if ( last && trietype ) {
3499                             if ( trietype != NOTHING ) {
3500                                 /* the last branch of the sequence was part of a trie,
3501                                  * so we have to construct it here outside of the loop
3502                                  */
3503                                 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3504 #ifdef TRIE_STUDY_OPT
3505                                 if ( ((made == MADE_EXACT_TRIE &&
3506                                      startbranch == first)
3507                                      || ( first_non_open == first )) &&
3508                                      depth==0 ) {
3509                                     flags |= SCF_TRIE_RESTUDY;
3510                                     if ( startbranch == first
3511                                          && scan == tail )
3512                                     {
3513                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3514                                     }
3515                                 }
3516 #endif
3517                             } else {
3518                                 /* at this point we know whatever we have is a NOTHING sequence/branch
3519                                  * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3520                                  */
3521                                 if ( startbranch == first ) {
3522                                     regnode *opt;
3523                                     /* the entire thing is a NOTHING sequence, something like this:
3524                                      * (?:|) So we can turn it into a plain NOTHING op. */
3525                                     DEBUG_TRIE_COMPILE_r({
3526                                         regprop(RExC_rx, mysv, cur);
3527                                         PerlIO_printf( Perl_debug_log,
3528                                           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3529                                           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3530
3531                                     });
3532                                     OP(startbranch)= NOTHING;
3533                                     NEXT_OFF(startbranch)= tail - startbranch;
3534                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
3535                                         OP(opt)= OPTIMIZED;
3536                                 }
3537                             }
3538                         } /* end if ( last) */
3539                     } /* TRIE_MAXBUF is non zero */
3540
3541                 } /* do trie */
3542
3543             }
3544             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
3545                 scan = NEXTOPER(NEXTOPER(scan));
3546             } else                      /* single branch is optimized. */
3547                 scan = NEXTOPER(scan);
3548             continue;
3549         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3550             scan_frame *newframe = NULL;
3551             I32 paren;
3552             regnode *start;
3553             regnode *end;
3554
3555             if (OP(scan) != SUSPEND) {
3556             /* set the pointer */
3557                 if (OP(scan) == GOSUB) {
3558                     paren = ARG(scan);
3559                     RExC_recurse[ARG2L(scan)] = scan;
3560                     start = RExC_open_parens[paren-1];
3561                     end   = RExC_close_parens[paren-1];
3562                 } else {
3563                     paren = 0;
3564                     start = RExC_rxi->program + 1;
3565                     end   = RExC_opend;
3566                 }
3567                 if (!recursed) {
3568                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3569                     SAVEFREEPV(recursed);
3570                 }
3571                 if (!PAREN_TEST(recursed,paren+1)) {
3572                     PAREN_SET(recursed,paren+1);
3573                     Newx(newframe,1,scan_frame);
3574                 } else {
3575                     if (flags & SCF_DO_SUBSTR) {
3576                         SCAN_COMMIT(pRExC_state,data,minlenp);
3577                         data->longest = &(data->longest_float);
3578                     }
3579                     is_inf = is_inf_internal = 1;
3580                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3581                         cl_anything(pRExC_state, data->start_class);
3582                     flags &= ~SCF_DO_STCLASS;
3583                 }
3584             } else {
3585                 Newx(newframe,1,scan_frame);
3586                 paren = stopparen;
3587                 start = scan+2;
3588                 end = regnext(scan);
3589             }
3590             if (newframe) {
3591                 assert(start);
3592                 assert(end);
3593                 SAVEFREEPV(newframe);
3594                 newframe->next = regnext(scan);
3595                 newframe->last = last;
3596                 newframe->stop = stopparen;
3597                 newframe->prev = frame;
3598
3599                 frame = newframe;
3600                 scan =  start;
3601                 stopparen = paren;
3602                 last = end;
3603
3604                 continue;
3605             }
3606         }
3607         else if (OP(scan) == EXACT) {
3608             SSize_t l = STR_LEN(scan);
3609             UV uc;
3610             if (UTF) {
3611                 const U8 * const s = (U8*)STRING(scan);
3612                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3613                 l = utf8_length(s, s + l);
3614             } else {
3615                 uc = *((U8*)STRING(scan));
3616             }
3617             min += l;
3618             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3619                 /* The code below prefers earlier match for fixed
3620                    offset, later match for variable offset.  */
3621                 if (data->last_end == -1) { /* Update the start info. */
3622                     data->last_start_min = data->pos_min;
3623                     data->last_start_max = is_inf
3624                         ? SSize_t_MAX : data->pos_min + data->pos_delta;
3625                 }
3626                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3627                 if (UTF)
3628                     SvUTF8_on(data->last_found);
3629                 {
3630                     SV * const sv = data->last_found;
3631                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3632                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3633                     if (mg && mg->mg_len >= 0)
3634                         mg->mg_len += utf8_length((U8*)STRING(scan),
3635                                                   (U8*)STRING(scan)+STR_LEN(scan));
3636                 }
3637                 data->last_end = data->pos_min + l;
3638                 data->pos_min += l; /* As in the first entry. */
3639                 data->flags &= ~SF_BEFORE_EOL;
3640             }
3641             if (flags & SCF_DO_STCLASS_AND) {
3642                 /* Check whether it is compatible with what we know already! */
3643                 int compat = 1;
3644
3645
3646                 /* If compatible, we or it in below.  It is compatible if is
3647                  * in the bitmp and either 1) its bit or its fold is set, or 2)
3648                  * it's for a locale.  Even if there isn't unicode semantics
3649                  * here, at runtime there may be because of matching against a
3650                  * utf8 string, so accept a possible false positive for
3651                  * latin1-range folds */
3652                 if (uc >= 0x100 ||
3653                     (!(data->start_class->flags & ANYOF_LOCALE)
3654                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3655                     && (!(data->start_class->flags & ANYOF_LOC_FOLD)
3656                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3657                     )
3658                 {
3659                     compat = 0;
3660                 }
3661                 ANYOF_CLASS_ZERO(data->start_class);
3662                 ANYOF_BITMAP_ZERO(data->start_class);
3663                 if (compat)
3664                     ANYOF_BITMAP_SET(data->start_class, uc);
3665                 else if (uc >= 0x100) {
3666                     int i;
3667
3668                     /* Some Unicode code points fold to the Latin1 range; as
3669                      * XXX temporary code, instead of figuring out if this is
3670                      * one, just assume it is and set all the start class bits
3671                      * that could be some such above 255 code point's fold
3672                      * which will generate fals positives.  As the code
3673                      * elsewhere that does compute the fold settles down, it
3674                      * can be extracted out and re-used here */
3675                     for (i = 0; i < 256; i++){
3676                         if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3677                             ANYOF_BITMAP_SET(data->start_class, i);
3678                         }
3679                     }
3680                 }
3681                 CLEAR_SSC_EOS(data->start_class);
3682                 if (uc < 0x100)
3683                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3684             }
3685             else if (flags & SCF_DO_STCLASS_OR) {
3686                 /* false positive possible if the class is case-folded */
3687                 if (uc < 0x100)
3688                     ANYOF_BITMAP_SET(data->start_class, uc);
3689                 else
3690                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3691                 CLEAR_SSC_EOS(data->start_class);
3692                 cl_and(data->start_class, and_withp);
3693             }
3694             flags &= ~SCF_DO_STCLASS;
3695         }
3696         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3697             SSize_t l = STR_LEN(scan);
3698             UV uc = *((U8*)STRING(scan));
3699
3700             /* Search for fixed substrings supports EXACT only. */
3701             if (flags & SCF_DO_SUBSTR) {
3702                 assert(data);
3703                 SCAN_COMMIT(pRExC_state, data, minlenp);
3704             }
3705             if (UTF) {
3706                 const U8 * const s = (U8 *)STRING(scan);
3707                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3708                 l = utf8_length(s, s + l);
3709             }
3710             if (has_exactf_sharp_s) {
3711                 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3712             }
3713             min += l - min_subtract;
3714             assert (min >= 0);
3715             delta += min_subtract;
3716             if (flags & SCF_DO_SUBSTR) {
3717                 data->pos_min += l - min_subtract;
3718                 if (data->pos_min < 0) {
3719                     data->pos_min = 0;
3720                 }
3721                 data->pos_delta += min_subtract;
3722                 if (min_subtract) {
3723                     data->longest = &(data->longest_float);
3724                 }
3725             }
3726             if (flags & SCF_DO_STCLASS_AND) {
3727                 /* Check whether it is compatible with what we know already! */
3728                 int compat = 1;
3729                 if (uc >= 0x100 ||
3730                  (!(data->start_class->flags & ANYOF_LOCALE)
3731                   && !ANYOF_BITMAP_TEST(data->start_class, uc)
3732                   && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3733                 {
3734                     compat = 0;
3735                 }
3736                 ANYOF_CLASS_ZERO(data->start_class);
3737                 ANYOF_BITMAP_ZERO(data->start_class);
3738                 if (compat) {
3739                     ANYOF_BITMAP_SET(data->start_class, uc);
3740                     CLEAR_SSC_EOS(data->start_class);
3741                     if (OP(scan) == EXACTFL) {
3742                         /* XXX This set is probably no longer necessary, and
3743                          * probably wrong as LOCALE now is on in the initial
3744                          * state */
3745                         data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
3746                     }
3747                     else {
3748
3749                         /* Also set the other member of the fold pair.  In case
3750                          * that unicode semantics is called for at runtime, use
3751                          * the full latin1 fold.  (Can't do this for locale,
3752                          * because not known until runtime) */
3753                         ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3754
3755                         /* All other (EXACTFL handled above) folds except under
3756                          * /iaa that include s, S, and sharp_s also may include
3757                          * the others */
3758                         if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE)
3759                         {
3760                             if (uc == 's' || uc == 'S') {
3761                                 ANYOF_BITMAP_SET(data->start_class,
3762                                                  LATIN_SMALL_LETTER_SHARP_S);
3763                             }
3764                             else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3765                                 ANYOF_BITMAP_SET(data->start_class, 's');
3766                                 ANYOF_BITMAP_SET(data->start_class, 'S');
3767                             }
3768                         }
3769                     }
3770                 }
3771                 else if (uc >= 0x100) {
3772                     int i;
3773                     for (i = 0; i < 256; i++){
3774                         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3775                             ANYOF_BITMAP_SET(data->start_class, i);
3776                         }
3777                     }
3778                 }
3779             }
3780             else if (flags & SCF_DO_STCLASS_OR) {
3781                 if (data->start_class->flags & ANYOF_LOC_FOLD) {
3782                     /* false positive possible if the class is case-folded.
3783                        Assume that the locale settings are the same... */
3784                     if (uc < 0x100) {
3785                         ANYOF_BITMAP_SET(data->start_class, uc);
3786                         if (OP(scan) != EXACTFL) {
3787
3788                             /* And set the other member of the fold pair, but
3789                              * can't do that in locale because not known until
3790                              * run-time */
3791                             ANYOF_BITMAP_SET(data->start_class,
3792                                              PL_fold_latin1[uc]);
3793
3794                             /* All folds except under /iaa that include s, S,
3795                              * and sharp_s also may include the others */
3796                             if (OP(scan) != EXACTFA
3797                                 && OP(scan) != EXACTFA_NO_TRIE)
3798                             {
3799                                 if (uc == 's' || uc == 'S') {
3800                                     ANYOF_BITMAP_SET(data->start_class,
3801                                                    LATIN_SMALL_LETTER_SHARP_S);
3802                                 }
3803                                 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3804                                     ANYOF_BITMAP_SET(data->start_class, 's');
3805                                     ANYOF_BITMAP_SET(data->start_class, 'S');
3806                                 }
3807                             }
3808                         }
3809                     }
3810                     CLEAR_SSC_EOS(data->start_class);
3811                 }
3812                 cl_and(data->start_class, and_withp);
3813             }
3814             flags &= ~SCF_DO_STCLASS;
3815         }
3816         else if (REGNODE_VARIES(OP(scan))) {
3817             SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
3818             I32 fl = 0, f = flags;
3819             regnode * const oscan = scan;
3820             struct regnode_charclass_class this_class;
3821             struct regnode_charclass_class *oclass = NULL;
3822             I32 next_is_eval = 0;
3823
3824             switch (PL_regkind[OP(scan)]) {
3825             case WHILEM:                /* End of (?:...)* . */
3826                 scan = NEXTOPER(scan);
3827                 goto finish;
3828             case PLUS:
3829                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3830                     next = NEXTOPER(scan);
3831                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3832                         mincount = 1;
3833                         maxcount = REG_INFTY;
3834                         next = regnext(scan);
3835                         scan = NEXTOPER(scan);
3836                         goto do_curly;
3837                     }
3838                 }
3839                 if (flags & SCF_DO_SUBSTR)
3840                     data->pos_min++;
3841                 min++;
3842                 /* Fall through. */
3843             case STAR:
3844                 if (flags & SCF_DO_STCLASS) {
3845                     mincount = 0;
3846                     maxcount = REG_INFTY;
3847                     next = regnext(scan);
3848                     scan = NEXTOPER(scan);
3849                     goto do_curly;
3850                 }
3851                 is_inf = is_inf_internal = 1;
3852                 scan = regnext(scan);
3853                 if (flags & SCF_DO_SUBSTR) {
3854                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3855                     data->longest = &(data->longest_float);
3856                 }
3857                 goto optimize_curly_tail;
3858             case CURLY:
3859                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3860                     && (scan->flags == stopparen))
3861                 {
3862                     mincount = 1;
3863                     maxcount = 1;
3864                 } else {
3865                     mincount = ARG1(scan);
3866                     maxcount = ARG2(scan);
3867                 }
3868                 next = regnext(scan);
3869                 if (OP(scan) == CURLYX) {
3870                     I32 lp = (data ? *(data->last_closep) : 0);
3871                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3872                 }
3873                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3874                 next_is_eval = (OP(scan) == EVAL);
3875               do_curly:
3876                 if (flags & SCF_DO_SUBSTR) {
3877                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3878                     pos_before = data->pos_min;
3879                 }
3880                 if (data) {
3881                     fl = data->flags;
3882                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3883                     if (is_inf)
3884                         data->flags |= SF_IS_INF;
3885                 }
3886                 if (flags & SCF_DO_STCLASS) {
3887                     cl_init(pRExC_state, &this_class);
3888                     oclass = data->start_class;
3889                     data->start_class = &this_class;
3890                     f |= SCF_DO_STCLASS_AND;
3891                     f &= ~SCF_DO_STCLASS_OR;
3892                 }
3893                 /* Exclude from super-linear cache processing any {n,m}
3894                    regops for which the combination of input pos and regex
3895                    pos is not enough information to determine if a match
3896                    will be possible.
3897
3898                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
3899                    regex pos at the \s*, the prospects for a match depend not
3900                    only on the input position but also on how many (bar\s*)
3901                    repeats into the {4,8} we are. */
3902                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3903                     f &= ~SCF_WHILEM_VISITED_POS;
3904
3905                 /* This will finish on WHILEM, setting scan, or on NULL: */
3906                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3907                                       last, data, stopparen, recursed, NULL,
3908                                       (mincount == 0
3909                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3910
3911                 if (flags & SCF_DO_STCLASS)
3912                     data->start_class = oclass;
3913                 if (mincount == 0 || minnext == 0) {
3914                     if (flags & SCF_DO_STCLASS_OR) {
3915                         cl_or(pRExC_state, data->start_class, &this_class);
3916                     }
3917                     else if (flags & SCF_DO_STCLASS_AND) {
3918                         /* Switch to OR mode: cache the old value of
3919                          * data->start_class */
3920                         INIT_AND_WITHP;
3921                         StructCopy(data->start_class, and_withp,
3922                                    struct regnode_charclass_class);
3923                         flags &= ~SCF_DO_STCLASS_AND;
3924                         StructCopy(&this_class, data->start_class,
3925                                    struct regnode_charclass_class);
3926                         flags |= SCF_DO_STCLASS_OR;
3927                         SET_SSC_EOS(data->start_class);
3928                     }
3929                 } else {                /* Non-zero len */
3930                     if (flags & SCF_DO_STCLASS_OR) {
3931                         cl_or(pRExC_state, data->start_class, &this_class);
3932                         cl_and(data->start_class, and_withp);
3933                     }
3934                     else if (flags & SCF_DO_STCLASS_AND)
3935                         cl_and(data->start_class, &this_class);
3936                     flags &= ~SCF_DO_STCLASS;
3937                 }
3938                 if (!scan)              /* It was not CURLYX, but CURLY. */
3939                     scan = next;
3940                 if (!(flags & SCF_TRIE_DOING_RESTUDY)
3941                     /* ? quantifier ok, except for (?{ ... }) */
3942                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
3943                     && (minnext == 0) && (deltanext == 0)
3944                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3945                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
3946                 {
3947                     /* Fatal warnings may leak the regexp without this: */
3948                     SAVEFREESV(RExC_rx_sv);
3949                     ckWARNreg(RExC_parse,
3950                               "Quantifier unexpected on zero-length expression");
3951                     (void)ReREFCNT_inc(RExC_rx_sv);
3952                 }
3953
3954                 min += minnext * mincount;
3955                 is_inf_internal |= deltanext == SSize_t_MAX
3956                                      || (maxcount == REG_INFTY && minnext + deltanext > 0);
3957                 is_inf |= is_inf_internal;
3958                 if (is_inf)
3959                     delta = SSize_t_MAX;
3960                 else
3961                     delta += (minnext + deltanext) * maxcount - minnext * mincount;
3962
3963                 /* Try powerful optimization CURLYX => CURLYN. */
3964                 if (  OP(oscan) == CURLYX && data
3965                       && data->flags & SF_IN_PAR
3966                       && !(data->flags & SF_HAS_EVAL)
3967                       && !deltanext && minnext == 1 ) {
3968                     /* Try to optimize to CURLYN.  */
3969                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3970                     regnode * const nxt1 = nxt;
3971 #ifdef DEBUGGING
3972                     regnode *nxt2;
3973 #endif
3974
3975                     /* Skip open. */
3976                     nxt = regnext(nxt);
3977                     if (!REGNODE_SIMPLE(OP(nxt))
3978                         && !(PL_regkind[OP(nxt)] == EXACT
3979                              && STR_LEN(nxt) == 1))
3980                         goto nogo;
3981 #ifdef DEBUGGING
3982                     nxt2 = nxt;
3983 #endif
3984                     nxt = regnext(nxt);
3985                     if (OP(nxt) != CLOSE)
3986                         goto nogo;
3987                     if (RExC_open_parens) {
3988                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3989                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3990                     }
3991                     /* Now we know that nxt2 is the only contents: */
3992                     oscan->flags = (U8)ARG(nxt);
3993                     OP(oscan) = CURLYN;
3994                     OP(nxt1) = NOTHING; /* was OPEN. */
3995
3996 #ifdef DEBUGGING
3997                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3998                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3999                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4000                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
4001                     OP(nxt + 1) = OPTIMIZED; /* was count. */
4002                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4003 #endif
4004                 }
4005               nogo:
4006
4007                 /* Try optimization CURLYX => CURLYM. */
4008                 if (  OP(oscan) == CURLYX && data
4009                       && !(data->flags & SF_HAS_PAR)
4010                       && !(data->flags & SF_HAS_EVAL)
4011                       && !deltanext     /* atom is fixed width */
4012                       && minnext != 0   /* CURLYM can't handle zero width */
4013                       && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
4014                 ) {
4015                     /* XXXX How to optimize if data == 0? */
4016                     /* Optimize to a simpler form.  */
4017                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4018                     regnode *nxt2;
4019
4020                     OP(oscan) = CURLYM;
4021                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4022                             && (OP(nxt2) != WHILEM))
4023                         nxt = nxt2;
4024                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
4025                     /* Need to optimize away parenths. */
4026                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4027                         /* Set the parenth number.  */
4028                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4029
4030                         oscan->flags = (U8)ARG(nxt);
4031                         if (RExC_open_parens) {
4032                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4033                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4034                         }
4035                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
4036                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
4037
4038 #ifdef DEBUGGING
4039                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4040                         OP(nxt + 1) = OPTIMIZED; /* was count. */
4041                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4042                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4043 #endif
4044 #if 0
4045                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
4046                             regnode *nnxt = regnext(nxt1);
4047                             if (nnxt == nxt) {
4048                                 if (reg_off_by_arg[OP(nxt1)])
4049                                     ARG_SET(nxt1, nxt2 - nxt1);
4050                                 else if (nxt2 - nxt1 < U16_MAX)
4051                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
4052                                 else
4053                                     OP(nxt) = NOTHING;  /* Cannot beautify */
4054                             }
4055                             nxt1 = nnxt;
4056                         }
4057 #endif
4058                         /* Optimize again: */
4059                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4060                                     NULL, stopparen, recursed, NULL, 0,depth+1);
4061                     }
4062                     else
4063                         oscan->flags = 0;
4064                 }
4065                 else if ((OP(oscan) == CURLYX)
4066                          && (flags & SCF_WHILEM_VISITED_POS)
4067                          /* See the comment on a similar expression above.
4068                             However, this time it's not a subexpression
4069                             we care about, but the expression itself. */
4070                          && (maxcount == REG_INFTY)
4071                          && data && ++data->whilem_c < 16) {
4072                     /* This stays as CURLYX, we can put the count/of pair. */
4073                     /* Find WHILEM (as in regexec.c) */
4074                     regnode *nxt = oscan + NEXT_OFF(oscan);
4075
4076                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4077                         nxt += ARG(nxt);
4078                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
4079                         | (RExC_whilem_seen << 4)); /* On WHILEM */
4080                 }
4081                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4082                     pars++;
4083                 if (flags & SCF_DO_SUBSTR) {
4084                     SV *last_str = NULL;
4085                     int counted = mincount != 0;
4086
4087                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4088 #if defined(SPARC64_GCC_WORKAROUND)
4089                         SSize_t b = 0;
4090                         STRLEN l = 0;
4091                         const char *s = NULL;
4092                         SSize_t old = 0;
4093
4094                         if (pos_before >= data->last_start_min)
4095                             b = pos_before;
4096                         else
4097                             b = data->last_start_min;
4098
4099                         l = 0;
4100                         s = SvPV_const(data->last_found, l);
4101                         old = b - data->last_start_min;
4102
4103 #else
4104                         SSize_t b = pos_before >= data->last_start_min
4105                             ? pos_before : data->last_start_min;
4106                         STRLEN l;
4107                         const char * const s = SvPV_const(data->last_found, l);
4108                         SSize_t old = b - data->last_start_min;
4109 #endif
4110
4111                         if (UTF)
4112                             old = utf8_hop((U8*)s, old) - (U8*)s;
4113                         l -= old;
4114                         /* Get the added string: */
4115                         last_str = newSVpvn_utf8(s  + old, l, UTF);
4116                         if (deltanext == 0 && pos_before == b) {
4117                             /* What was added is a constant string */
4118                             if (mincount > 1) {
4119                                 SvGROW(last_str, (mincount * l) + 1);
4120                                 repeatcpy(SvPVX(last_str) + l,
4121                                           SvPVX_const(last_str), l, mincount - 1);
4122                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4123                                 /* Add additional parts. */
4124                                 SvCUR_set(data->last_found,
4125                                           SvCUR(data->last_found) - l);
4126                                 sv_catsv(data->last_found, last_str);
4127                                 {
4128                                     SV * sv = data->last_found;
4129                                     MAGIC *mg =
4130                                         SvUTF8(sv) && SvMAGICAL(sv) ?
4131                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4132                                     if (mg && mg->mg_len >= 0)
4133                                         mg->mg_len += CHR_SVLEN(last_str) - l;
4134                                 }
4135                                 data->last_end += l * (mincount - 1);
4136                             }
4137                         } else {
4138                             /* start offset must point into the last copy */
4139                             data->last_start_min += minnext * (mincount - 1);
4140                             data->last_start_max += is_inf ? SSize_t_MAX
4141                                 : (maxcount - 1) * (minnext + data->pos_delta);
4142                         }
4143                     }
4144                     /* It is counted once already... */
4145                     data->pos_min += minnext * (mincount - counted);
4146 #if 0
4147 PerlIO_printf(Perl_debug_log, "counted=%"UVdf" deltanext=%"UVdf
4148                               " SSize_t_MAX=%"UVdf" minnext=%"UVdf
4149                               " maxcount=%"UVdf" mincount=%"UVdf"\n",
4150     (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4151     (UV)mincount);
4152 if (deltanext != SSize_t_MAX)
4153 PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
4154     (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4155           - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4156 #endif
4157                     if (deltanext == SSize_t_MAX ||
4158                         -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4159                         data->pos_delta = SSize_t_MAX;
4160                     else
4161                         data->pos_delta += - counted * deltanext +
4162                         (minnext + deltanext) * maxcount - minnext * mincount;
4163                     if (mincount != maxcount) {
4164                          /* Cannot extend fixed substrings found inside
4165                             the group.  */
4166                         SCAN_COMMIT(pRExC_state,data,minlenp);
4167                         if (mincount && last_str) {
4168                             SV * const sv = data->last_found;
4169                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4170                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4171
4172                             if (mg)
4173                                 mg->mg_len = -1;
4174                             sv_setsv(sv, last_str);
4175                             data->last_end = data->pos_min;
4176                             data->last_start_min =
4177                                 data->pos_min - CHR_SVLEN(last_str);
4178                             data->last_start_max = is_inf
4179                                 ? SSize_t_MAX
4180                                 : data->pos_min + data->pos_delta
4181                                 - CHR_SVLEN(last_str);
4182                         }
4183                         data->longest = &(data->longest_float);
4184                     }
4185                     SvREFCNT_dec(last_str);
4186                 }
4187                 if (data && (fl & SF_HAS_EVAL))
4188                     data->flags |= SF_HAS_EVAL;
4189               optimize_curly_tail:
4190                 if (OP(oscan) != CURLYX) {
4191                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4192                            && NEXT_OFF(next))
4193                         NEXT_OFF(oscan) += NEXT_OFF(next);
4194                 }
4195                 continue;
4196             default:                    /* REF, and CLUMP only? */
4197                 if (flags & SCF_DO_SUBSTR) {
4198                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
4199                     data->longest = &(data->longest_float);
4200                 }
4201                 is_inf = is_inf_internal = 1;
4202                 if (flags & SCF_DO_STCLASS_OR)
4203                     cl_anything(pRExC_state, data->start_class);
4204                 flags &= ~SCF_DO_STCLASS;
4205                 break;
4206             }
4207         }
4208         else if (OP(scan) == LNBREAK) {
4209             if (flags & SCF_DO_STCLASS) {
4210                 int value = 0;
4211                 CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4212                 if (flags & SCF_DO_STCLASS_AND) {
4213                     for (value = 0; value < 256; value++)
4214                         if (!is_VERTWS_cp(value))
4215                             ANYOF_BITMAP_CLEAR(data->start_class, value);
4216                 }
4217                 else {
4218                     for (value = 0; value < 256; value++)
4219                         if (is_VERTWS_cp(value))
4220                             ANYOF_BITMAP_SET(data->start_class, value);
4221                 }
4222                 if (flags & SCF_DO_STCLASS_OR)
4223                     cl_and(data->start_class, and_withp);
4224                 flags &= ~SCF_DO_STCLASS;
4225             }
4226             min++;
4227             delta++;    /* Because of the 2 char string cr-lf */
4228             if (flags & SCF_DO_SUBSTR) {
4229                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4230                 data->pos_min += 1;
4231                 data->pos_delta += 1;
4232                 data->longest = &(data->longest_float);
4233             }
4234         }
4235         else if (REGNODE_SIMPLE(OP(scan))) {
4236             int value = 0;
4237
4238             if (flags & SCF_DO_SUBSTR) {
4239                 SCAN_COMMIT(pRExC_state,data,minlenp);
4240                 data->pos_min++;
4241             }
4242             min++;
4243             if (flags & SCF_DO_STCLASS) {
4244                 int loop_max = 256;
4245                 CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4246
4247                 /* Some of the logic below assumes that switching
4248                    locale on will only add false positives. */
4249                 switch (PL_regkind[OP(scan)]) {
4250                     U8 classnum;
4251
4252                 case SANY:
4253                 default:
4254 #ifdef DEBUGGING
4255                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan));
4256 #endif
4257                  do_default:
4258                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4259                         cl_anything(pRExC_state, data->start_class);
4260                     break;
4261                 case REG_ANY:
4262                     if (OP(scan) == SANY)
4263                         goto do_default;
4264                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4265                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4266                                 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4267                         cl_anything(pRExC_state, data->start_class);
4268                     }
4269                     if (flags & SCF_DO_STCLASS_AND || !value)
4270                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4271                     break;
4272                 case ANYOF:
4273                     if (flags & SCF_DO_STCLASS_AND)
4274                         cl_and(data->start_class,
4275                                (struct regnode_charclass_class*)scan);
4276                     else
4277                         cl_or(pRExC_state, data->start_class,
4278                               (struct regnode_charclass_class*)scan);
4279                     break;
4280                 case POSIXA:
4281                     loop_max = 128;
4282                     /* FALL THROUGH */
4283                 case POSIXL:
4284                 case POSIXD:
4285                 case POSIXU:
4286                     classnum = FLAGS(scan);
4287                     if (flags & SCF_DO_STCLASS_AND) {
4288                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4289                             ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1);
4290                             for (value = 0; value < loop_max; value++) {
4291                                 if (! _generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
4292                                     ANYOF_BITMAP_CLEAR(data->start_class, LATIN1_TO_NATIVE(value));
4293                                 }
4294                             }
4295                         }
4296                     }
4297                     else {
4298                         if (data->start_class->flags & ANYOF_LOCALE) {
4299                             ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum));
4300                         }
4301                         else {
4302
4303                         /* Even if under locale, set the bits for non-locale
4304                          * in case it isn't a true locale-node.  This will
4305                          * create false positives if it truly is locale */
4306                         for (value = 0; value < loop_max; value++) {
4307                             if (_generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
4308                                 ANYOF_BITMAP_SET(data->start_class, LATIN1_TO_NATIVE(value));
4309                             }
4310                         }
4311                         }
4312                     }
4313                     break;
4314                 case NPOSIXA:
4315                     loop_max = 128;
4316                     /* FALL THROUGH */
4317                 case NPOSIXL:
4318                 case NPOSIXU:
4319                 case NPOSIXD:
4320                     classnum = FLAGS(scan);
4321                     if (flags & SCF_DO_STCLASS_AND) {
4322                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4323                             ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum));
4324                             for (value = 0; value < loop_max; value++) {
4325                                 if (_generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
4326                                     ANYOF_BITMAP_CLEAR(data->start_class, LATIN1_TO_NATIVE(value));
4327                                 }
4328                             }
4329                         }
4330                     }
4331                     else {
4332                         if (data->start_class->flags & ANYOF_LOCALE) {
4333                             ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum) + 1);
4334                         }
4335                         else {
4336
4337                         /* Even if under locale, set the bits for non-locale in
4338                          * case it isn't a true locale-node.  This will create
4339                          * false positives if it truly is locale */
4340                         for (value = 0; value < loop_max; value++) {
4341                             if (! _generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
4342                                 ANYOF_BITMAP_SET(data->start_class, LATIN1_TO_NATIVE(value));
4343                             }
4344                         }
4345                         if (PL_regkind[OP(scan)] == NPOSIXD) {
4346                             data->start_class->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
4347                         }
4348                         }
4349                     }
4350                     break;
4351                 }
4352                 if (flags & SCF_DO_STCLASS_OR)
4353                     cl_and(data->start_class, and_withp);
4354                 flags &= ~SCF_DO_STCLASS;
4355             }
4356         }
4357         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4358             data->flags |= (OP(scan) == MEOL
4359                             ? SF_BEFORE_MEOL
4360                             : SF_BEFORE_SEOL);
4361             SCAN_COMMIT(pRExC_state, data, minlenp);
4362
4363         }
4364         else if (  PL_regkind[OP(scan)] == BRANCHJ
4365                  /* Lookbehind, or need to calculate parens/evals/stclass: */
4366                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
4367                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4368             if ( OP(scan) == UNLESSM &&
4369                  scan->flags == 0 &&
4370                  OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4371                  OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4372             ) {
4373                 regnode *opt;
4374                 regnode *upto= regnext(scan);
4375                 DEBUG_PARSE_r({
4376                     SV * const mysv_val=sv_newmortal();
4377                     DEBUG_STUDYDATA("OPFAIL",data,depth);
4378
4379                     /*DEBUG_PARSE_MSG("opfail");*/
4380                     regprop(RExC_rx, mysv_val, upto);
4381                     PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4382                                   SvPV_nolen_const(mysv_val),
4383                                   (IV)REG_NODE_NUM(upto),
4384                                   (IV)(upto - scan)
4385                     );
4386                 });
4387                 OP(scan) = OPFAIL;
4388                 NEXT_OFF(scan) = upto - scan;
4389                 for (opt= scan + 1; opt < upto ; opt++)
4390                     OP(opt) = OPTIMIZED;
4391                 scan= upto;
4392                 continue;
4393             }
4394             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4395                 || OP(scan) == UNLESSM )
4396             {
4397                 /* Negative Lookahead/lookbehind
4398                    In this case we can't do fixed string optimisation.
4399                 */
4400
4401                 SSize_t deltanext, minnext, fake = 0;
4402                 regnode *nscan;
4403                 struct regnode_charclass_class intrnl;
4404                 int f = 0;
4405
4406                 data_fake.flags = 0;
4407                 if (data) {
4408                     data_fake.whilem_c = data->whilem_c;
4409                     data_fake.last_closep = data->last_closep;
4410                 }
4411                 else
4412                     data_fake.last_closep = &fake;
4413                 data_fake.pos_delta = delta;
4414                 if ( flags & SCF_DO_STCLASS && !scan->flags
4415                      && OP(scan) == IFMATCH ) { /* Lookahead */
4416                     cl_init(pRExC_state, &intrnl);
4417                     data_fake.start_class = &intrnl;
4418                     f |= SCF_DO_STCLASS_AND;
4419                 }
4420                 if (flags & SCF_WHILEM_VISITED_POS)
4421                     f |= SCF_WHILEM_VISITED_POS;
4422                 next = regnext(scan);
4423                 nscan = NEXTOPER(NEXTOPER(scan));
4424                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4425                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4426                 if (scan->flags) {
4427                     if (deltanext) {
4428                         FAIL("Variable length lookbehind not implemented");
4429                     }
4430                     else if (minnext > (I32)U8_MAX) {
4431                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4432                     }
4433                     scan->flags = (U8)minnext;
4434                 }
4435                 if (data) {
4436                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4437                         pars++;
4438                     if (data_fake.flags & SF_HAS_EVAL)
4439                         data->flags |= SF_HAS_EVAL;
4440                     data->whilem_c = data_fake.whilem_c;
4441                 }
4442                 if (f & SCF_DO_STCLASS_AND) {
4443                     if (flags & SCF_DO_STCLASS_OR) {
4444                         /* OR before, AND after: ideally we would recurse with
4445                          * data_fake to get the AND applied by study of the
4446                          * remainder of the pattern, and then derecurse;
4447                          * *** HACK *** for now just treat as "no information".
4448                          * See [perl #56690].
4449                          */
4450                         cl_init(pRExC_state, data->start_class);
4451                     }  else {
4452                         /* AND before and after: combine and continue */
4453                         const int was = TEST_SSC_EOS(data->start_class);
4454
4455                         cl_and(data->start_class, &intrnl);
4456                         if (was)
4457                             SET_SSC_EOS(data->start_class);
4458                     }
4459                 }
4460             }
4461 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4462             else {
4463                 /* Positive Lookahead/lookbehind
4464                    In this case we can do fixed string optimisation,
4465                    but we must be careful about it. Note in the case of
4466                    lookbehind the positions will be offset by the minimum
4467                    length of the pattern, something we won't know about
4468                    until after the recurse.
4469                 */
4470                 SSize_t deltanext;
4471                 I32 fake = 0;
4472                 regnode *nscan;
4473                 struct regnode_charclass_class intrnl;
4474                 int f = 0;
4475                 /* We use SAVEFREEPV so that when the full compile
4476                     is finished perl will clean up the allocated
4477                     minlens when it's all done. This way we don't
4478                     have to worry about freeing them when we know
4479                     they wont be used, which would be a pain.
4480                  */
4481                 SSize_t *minnextp;
4482                 Newx( minnextp, 1, SSize_t );
4483                 SAVEFREEPV(minnextp);
4484
4485                 if (data) {
4486                     StructCopy(data, &data_fake, scan_data_t);
4487                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4488                         f |= SCF_DO_SUBSTR;
4489                         if (scan->flags)
4490                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4491                         data_fake.last_found=newSVsv(data->last_found);
4492                     }
4493                 }
4494                 else
4495                     data_fake.last_closep = &fake;
4496                 data_fake.flags = 0;
4497                 data_fake.pos_delta = delta;
4498                 if (is_inf)
4499                     data_fake.flags |= SF_IS_INF;
4500                 if ( flags & SCF_DO_STCLASS && !scan->flags
4501                      && OP(scan) == IFMATCH ) { /* Lookahead */
4502                     cl_init(pRExC_state, &intrnl);
4503                     data_fake.start_class = &intrnl;
4504                     f |= SCF_DO_STCLASS_AND;
4505                 }
4506                 if (flags & SCF_WHILEM_VISITED_POS)
4507                     f |= SCF_WHILEM_VISITED_POS;
4508                 next = regnext(scan);
4509                 nscan = NEXTOPER(NEXTOPER(scan));
4510
4511                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4512                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4513                 if (scan->flags) {
4514                     if (deltanext) {
4515                         FAIL("Variable length lookbehind not implemented");
4516                     }
4517                     else if (*minnextp > (I32)U8_MAX) {
4518                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4519                     }
4520                     scan->flags = (U8)*minnextp;
4521                 }
4522
4523                 *minnextp += min;
4524
4525                 if (f & SCF_DO_STCLASS_AND) {
4526                     const int was = TEST_SSC_EOS(data.start_class);
4527
4528                     cl_and(data->start_class, &intrnl);
4529                     if (was)
4530                         SET_SSC_EOS(data->start_class);
4531                 }
4532                 if (data) {
4533                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4534                         pars++;
4535                     if (data_fake.flags & SF_HAS_EVAL)
4536                         data->flags |= SF_HAS_EVAL;
4537                     data->whilem_c = data_fake.whilem_c;
4538                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4539                         if (RExC_rx->minlen<*minnextp)
4540                             RExC_rx->minlen=*minnextp;
4541                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4542                         SvREFCNT_dec_NN(data_fake.last_found);
4543
4544                         if ( data_fake.minlen_fixed != minlenp )
4545                         {
4546                             data->offset_fixed= data_fake.offset_fixed;
4547                             data->minlen_fixed= data_fake.minlen_fixed;
4548                             data->lookbehind_fixed+= scan->flags;
4549                         }
4550                         if ( data_fake.minlen_float != minlenp )
4551                         {
4552                             data->minlen_float= data_fake.minlen_float;
4553                             data->offset_float_min=data_fake.offset_float_min;
4554                             data->offset_float_max=data_fake.offset_float_max;
4555                             data->lookbehind_float+= scan->flags;
4556                         }
4557                     }
4558                 }
4559             }
4560 #endif
4561         }
4562         else if (OP(scan) == OPEN) {
4563             if (stopparen != (I32)ARG(scan))
4564                 pars++;
4565         }
4566         else if (OP(scan) == CLOSE) {
4567             if (stopparen == (I32)ARG(scan)) {
4568                 break;
4569             }
4570             if ((I32)ARG(scan) == is_par) {
4571                 next = regnext(scan);
4572
4573                 if ( next && (OP(next) != WHILEM) && next < last)
4574                     is_par = 0;         /* Disable optimization */
4575             }
4576             if (data)
4577                 *(data->last_closep) = ARG(scan);
4578         }
4579         else if (OP(scan) == EVAL) {
4580                 if (data)
4581                     data->flags |= SF_HAS_EVAL;
4582         }
4583         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4584             if (flags & SCF_DO_SUBSTR) {
4585                 SCAN_COMMIT(pRExC_state,data,minlenp);
4586                 flags &= ~SCF_DO_SUBSTR;
4587             }
4588             if (data && OP(scan)==ACCEPT) {
4589                 data->flags |= SCF_SEEN_ACCEPT;
4590                 if (stopmin > min)
4591                     stopmin = min;
4592             }
4593         }
4594         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4595         {
4596                 if (flags & SCF_DO_SUBSTR) {
4597                     SCAN_COMMIT(pRExC_state,data,minlenp);
4598                     data->longest = &(data->longest_float);
4599                 }
4600                 is_inf = is_inf_internal = 1;
4601                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4602                     cl_anything(pRExC_state, data->start_class);
4603                 flags &= ~SCF_DO_STCLASS;
4604         }
4605         else if (OP(scan) == GPOS) {
4606             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4607                 !(delta || is_inf || (data && data->pos_delta)))
4608             {
4609                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4610                     RExC_rx->extflags |= RXf_ANCH_GPOS;
4611                 if (RExC_rx->gofs < (STRLEN)min)
4612                     RExC_rx->gofs = min;
4613             } else {
4614                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4615                 RExC_rx->gofs = 0;
4616             }
4617         }
4618 #ifdef TRIE_STUDY_OPT
4619 #ifdef FULL_TRIE_STUDY
4620         else if (PL_regkind[OP(scan)] == TRIE) {
4621             /* NOTE - There is similar code to this block above for handling
4622                BRANCH nodes on the initial study.  If you change stuff here
4623                check there too. */
4624             regnode *trie_node= scan;
4625             regnode *tail= regnext(scan);
4626             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4627             SSize_t max1 = 0, min1 = SSize_t_MAX;
4628             struct regnode_charclass_class accum;
4629
4630             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4631                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4632             if (flags & SCF_DO_STCLASS)
4633                 cl_init_zero(pRExC_state, &accum);
4634
4635             if (!trie->jump) {
4636                 min1= trie->minlen;
4637                 max1= trie->maxlen;
4638             } else {
4639                 const regnode *nextbranch= NULL;
4640                 U32 word;
4641
4642                 for ( word=1 ; word <= trie->wordcount ; word++)
4643                 {
4644                     SSize_t deltanext=0, minnext=0, f = 0, fake;
4645                     struct regnode_charclass_class this_class;
4646
4647                     data_fake.flags = 0;
4648                     if (data) {
4649                         data_fake.whilem_c = data->whilem_c;
4650                         data_fake.last_closep = data->last_closep;
4651                     }
4652                     else
4653                         data_fake.last_closep = &fake;
4654                     data_fake.pos_delta = delta;
4655                     if (flags & SCF_DO_STCLASS) {
4656                         cl_init(pRExC_state, &this_class);
4657                         data_fake.start_class = &this_class;
4658                         f = SCF_DO_STCLASS_AND;
4659                     }
4660                     if (flags & SCF_WHILEM_VISITED_POS)
4661                         f |= SCF_WHILEM_VISITED_POS;
4662
4663                     if (trie->jump[word]) {
4664                         if (!nextbranch)
4665                             nextbranch = trie_node + trie->jump[0];
4666                         scan= trie_node + trie->jump[word];
4667                         /* We go from the jump point to the branch that follows
4668                            it. Note this means we need the vestigal unused branches
4669                            even though they arent otherwise used.
4670                          */
4671                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4672                             &deltanext, (regnode *)nextbranch, &data_fake,
4673                             stopparen, recursed, NULL, f,depth+1);
4674                     }
4675                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4676                         nextbranch= regnext((regnode*)nextbranch);
4677
4678                     if (min1 > (SSize_t)(minnext + trie->minlen))
4679                         min1 = minnext + trie->minlen;
4680                     if (deltanext == SSize_t_MAX) {
4681                         is_inf = is_inf_internal = 1;
4682                         max1 = SSize_t_MAX;
4683                     } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
4684                         max1 = minnext + deltanext + trie->maxlen;
4685
4686                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4687                         pars++;
4688                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4689                         if ( stopmin > min + min1)
4690                             stopmin = min + min1;
4691                         flags &= ~SCF_DO_SUBSTR;
4692                         if (data)
4693                             data->flags |= SCF_SEEN_ACCEPT;
4694                     }
4695                     if (data) {
4696                         if (data_fake.flags & SF_HAS_EVAL)
4697                             data->flags |= SF_HAS_EVAL;
4698                         data->whilem_c = data_fake.whilem_c;
4699                     }
4700                     if (flags & SCF_DO_STCLASS)
4701                         cl_or(pRExC_state, &accum, &this_class);
4702                 }
4703             }
4704             if (flags & SCF_DO_SUBSTR) {
4705                 data->pos_min += min1;
4706                 data->pos_delta += max1 - min1;
4707                 if (max1 != min1 || is_inf)
4708                     data->longest = &(data->longest_float);
4709             }
4710             min += min1;
4711             delta += max1 - min1;
4712             if (flags & SCF_DO_STCLASS_OR) {
4713                 cl_or(pRExC_state, data->start_class, &accum);
4714                 if (min1) {
4715                     cl_and(data->start_class, and_withp);
4716                     flags &= ~SCF_DO_STCLASS;
4717                 }
4718             }
4719             else if (flags & SCF_DO_STCLASS_AND) {
4720                 if (min1) {
4721                     cl_and(data->start_class, &accum);
4722                     flags &= ~SCF_DO_STCLASS;
4723                 }
4724                 else {
4725                     /* Switch to OR mode: cache the old value of
4726                      * data->start_class */
4727                     INIT_AND_WITHP;
4728                     StructCopy(data->start_class, and_withp,
4729                                struct regnode_charclass_class);
4730                     flags &= ~SCF_DO_STCLASS_AND;
4731                     StructCopy(&accum, data->start_class,
4732                                struct regnode_charclass_class);
4733                     flags |= SCF_DO_STCLASS_OR;
4734                     SET_SSC_EOS(data->start_class);
4735                 }
4736             }
4737             scan= tail;
4738             continue;
4739         }
4740 #else
4741         else if (PL_regkind[OP(scan)] == TRIE) {
4742             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4743             U8*bang=NULL;
4744
4745             min += trie->minlen;
4746             delta += (trie->maxlen - trie->minlen);
4747             flags &= ~SCF_DO_STCLASS; /* xxx */
4748             if (flags & SCF_DO_SUBSTR) {
4749                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4750                 data->pos_min += trie->minlen;
4751                 data->pos_delta += (trie->maxlen - trie->minlen);
4752                 if (trie->maxlen != trie->minlen)
4753                     data->longest = &(data->longest_float);
4754             }
4755             if (trie->jump) /* no more substrings -- for now /grr*/
4756                 flags &= ~SCF_DO_SUBSTR;
4757         }
4758 #endif /* old or new */
4759 #endif /* TRIE_STUDY_OPT */
4760
4761         /* Else: zero-length, ignore. */
4762         scan = regnext(scan);
4763     }
4764     if (frame) {
4765         last = frame->last;
4766         scan = frame->next;
4767         stopparen = frame->stop;
4768         frame = frame->prev;
4769         goto fake_study_recurse;
4770     }
4771
4772   finish:
4773     assert(!frame);
4774     DEBUG_STUDYDATA("pre-fin:",data,depth);
4775
4776     *scanp = scan;
4777     *deltap = is_inf_internal ? SSize_t_MAX : delta;
4778     if (flags & SCF_DO_SUBSTR && is_inf)
4779         data->pos_delta = SSize_t_MAX - data->pos_min;
4780     if (is_par > (I32)U8_MAX)
4781         is_par = 0;
4782     if (is_par && pars==1 && data) {
4783         data->flags |= SF_IN_PAR;
4784         data->flags &= ~SF_HAS_PAR;
4785     }
4786     else if (pars && data) {
4787         data->flags |= SF_HAS_PAR;
4788         data->flags &= ~SF_IN_PAR;
4789     }
4790     if (flags & SCF_DO_STCLASS_OR)
4791         cl_and(data->start_class, and_withp);
4792     if (flags & SCF_TRIE_RESTUDY)
4793         data->flags |=  SCF_TRIE_RESTUDY;
4794
4795     DEBUG_STUDYDATA("post-fin:",data,depth);
4796
4797     return min < stopmin ? min : stopmin;
4798 }
4799
4800 STATIC U32
4801 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4802 {
4803     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4804
4805     PERL_ARGS_ASSERT_ADD_DATA;
4806
4807     Renewc(RExC_rxi->data,
4808            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4809            char, struct reg_data);
4810     if(count)
4811         Renew(RExC_rxi->data->what, count + n, U8);
4812     else
4813         Newx(RExC_rxi->data->what, n, U8);
4814     RExC_rxi->data->count = count + n;
4815     Copy(s, RExC_rxi->data->what + count, n, U8);
4816     return count;
4817 }
4818
4819 /*XXX: todo make this not included in a non debugging perl */
4820 #ifndef PERL_IN_XSUB_RE
4821 void
4822 Perl_reginitcolors(pTHX)
4823 {
4824     dVAR;
4825     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4826     if (s) {
4827         char *t = savepv(s);
4828         int i = 0;
4829         PL_colors[0] = t;
4830         while (++i < 6) {
4831             t = strchr(t, '\t');
4832             if (t) {
4833                 *t = '\0';
4834                 PL_colors[i] = ++t;
4835             }
4836             else
4837                 PL_colors[i] = t = (char *)"";
4838         }
4839     } else {
4840         int i = 0;
4841         while (i < 6)
4842             PL_colors[i++] = (char *)"";
4843     }
4844     PL_colorset = 1;
4845 }
4846 #endif
4847
4848
4849 #ifdef TRIE_STUDY_OPT
4850 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
4851     STMT_START {                                            \
4852         if (                                                \
4853               (data.flags & SCF_TRIE_RESTUDY)               \
4854               && ! restudied++                              \
4855         ) {                                                 \
4856             dOsomething;                                    \
4857             goto reStudy;                                   \
4858         }                                                   \
4859     } STMT_END
4860 #else
4861 #define CHECK_RESTUDY_GOTO_butfirst
4862 #endif
4863
4864 /*
4865  * pregcomp - compile a regular expression into internal code
4866  *
4867  * Decides which engine's compiler to call based on the hint currently in
4868  * scope
4869  */
4870
4871 #ifndef PERL_IN_XSUB_RE
4872
4873 /* return the currently in-scope regex engine (or the default if none)  */
4874
4875 regexp_engine const *
4876 Perl_current_re_engine(pTHX)
4877 {
4878     dVAR;
4879
4880     if (IN_PERL_COMPILETIME) {
4881         HV * const table = GvHV(PL_hintgv);
4882         SV **ptr;
4883
4884         if (!table)
4885             return &PL_core_reg_engine;
4886         ptr = hv_fetchs(table, "regcomp", FALSE);
4887         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4888             return &PL_core_reg_engine;
4889         return INT2PTR(regexp_engine*,SvIV(*ptr));
4890     }
4891     else {
4892         SV *ptr;
4893         if (!PL_curcop->cop_hints_hash)
4894             return &PL_core_reg_engine;
4895         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4896         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4897             return &PL_core_reg_engine;
4898         return INT2PTR(regexp_engine*,SvIV(ptr));
4899     }
4900 }
4901
4902
4903 REGEXP *
4904 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4905 {
4906     dVAR;
4907     regexp_engine const *eng = current_re_engine();
4908     GET_RE_DEBUG_FLAGS_DECL;
4909
4910     PERL_ARGS_ASSERT_PREGCOMP;
4911
4912     /* Dispatch a request to compile a regexp to correct regexp engine. */
4913     DEBUG_COMPILE_r({
4914         PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4915                         PTR2UV(eng));
4916     });
4917     return CALLREGCOMP_ENG(eng, pattern, flags);
4918 }
4919 #endif
4920
4921 /* public(ish) entry point for the perl core's own regex compiling code.
4922  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
4923  * pattern rather than a list of OPs, and uses the internal engine rather
4924  * than the current one */
4925
4926 REGEXP *
4927 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4928 {
4929     SV *pat = pattern; /* defeat constness! */
4930     PERL_ARGS_ASSERT_RE_COMPILE;
4931     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
4932 #ifdef PERL_IN_XSUB_RE
4933                                 &my_reg_engine,
4934 #else
4935                                 &PL_core_reg_engine,
4936 #endif
4937                                 NULL, NULL, rx_flags, 0);
4938 }
4939
4940
4941 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
4942  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
4943  * point to the realloced string and length.
4944  *
4945  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
4946  * stuff added */
4947
4948 static void
4949 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
4950                     char **pat_p, STRLEN *plen_p, int num_code_blocks)
4951 {
4952     U8 *const src = (U8*)*pat_p;
4953     U8 *dst;
4954     int n=0;
4955     STRLEN s = 0, d = 0;
4956     bool do_end = 0;
4957     GET_RE_DEBUG_FLAGS_DECL;
4958
4959     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4960         "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4961
4962     Newx(dst, *plen_p * 2 + 1, U8);
4963
4964     while (s < *plen_p) {
4965         if (NATIVE_IS_INVARIANT(src[s]))
4966             dst[d]   = src[s];
4967         else {
4968             dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
4969             dst[d]   = UTF8_EIGHT_BIT_LO(src[s]);
4970         }
4971         if (n < num_code_blocks) {
4972             if (!do_end && pRExC_state->code_blocks[n].start == s) {
4973                 pRExC_state->code_blocks[n].start = d;
4974                 assert(dst[d] == '(');
4975                 do_end = 1;
4976             }
4977             else if (do_end && pRExC_state->code_blocks[n].end == s) {
4978                 pRExC_state->code_blocks[n].end = d;
4979                 assert(dst[d] == ')');
4980                 do_end = 0;
4981                 n++;
4982             }
4983         }
4984         s++;
4985         d++;
4986     }
4987     dst[d] = '\0';
4988     *plen_p = d;
4989     *pat_p = (char*) dst;
4990     SAVEFREEPV(*pat_p);
4991     RExC_orig_utf8 = RExC_utf8 = 1;
4992 }
4993
4994
4995
4996 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
4997  * while recording any code block indices, and handling overloading,
4998  * nested qr// objects etc.  If pat is null, it will allocate a new
4999  * string, or just return the first arg, if there's only one.
5000  *
5001  * Returns the malloced/updated pat.
5002  * patternp and pat_count is the array of SVs to be concatted;
5003  * oplist is the optional list of ops that generated the SVs;
5004  * recompile_p is a pointer to a boolean that will be set if
5005  *   the regex will need to be recompiled.
5006  * delim, if non-null is an SV that will be inserted between each element
5007  */
5008
5009 static SV*
5010 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5011                 SV *pat, SV ** const patternp, int pat_count,
5012                 OP *oplist, bool *recompile_p, SV *delim)
5013 {
5014     SV **svp;
5015     int n = 0;
5016     bool use_delim = FALSE;
5017     bool alloced = FALSE;
5018
5019     /* if we know we have at least two args, create an empty string,
5020      * then concatenate args to that. For no args, return an empty string */
5021     if (!pat && pat_count != 1) {
5022         pat = newSVpvn("", 0);
5023         SAVEFREESV(pat);
5024         alloced = TRUE;
5025     }
5026
5027     for (svp = patternp; svp < patternp + pat_count; svp++) {
5028         SV *sv;
5029         SV *rx  = NULL;
5030         STRLEN orig_patlen = 0;
5031         bool code = 0;
5032         SV *msv = use_delim ? delim : *svp;
5033
5034         /* if we've got a delimiter, we go round the loop twice for each
5035          * svp slot (except the last), using the delimiter the second
5036          * time round */
5037         if (use_delim) {
5038             svp--;
5039             use_delim = FALSE;
5040         }
5041         else if (delim)
5042             use_delim = TRUE;
5043
5044         if (SvTYPE(msv) == SVt_PVAV) {
5045             /* we've encountered an interpolated array within
5046              * the pattern, e.g. /...@a..../. Expand the list of elements,
5047              * then recursively append elements.
5048              * The code in this block is based on S_pushav() */
5049
5050             AV *const av = (AV*)msv;
5051             const I32 maxarg = AvFILL(av) + 1;
5052             SV **array;
5053
5054             if (oplist) {
5055                 assert(oplist->op_type == OP_PADAV
5056                     || oplist->op_type == OP_RV2AV);
5057                 oplist = oplist->op_sibling;;
5058             }
5059
5060             if (SvRMAGICAL(av)) {
5061                 U32 i;
5062
5063                 Newx(array, maxarg, SV*);
5064                 SAVEFREEPV(array);
5065                 for (i=0; i < (U32)maxarg; i++) {
5066                     SV ** const svp = av_fetch(av, i, FALSE);
5067                     array[i] = svp ? *svp : &PL_sv_undef;
5068                 }
5069             }
5070             else
5071                 array = AvARRAY(av);
5072
5073             pat = S_concat_pat(aTHX_ pRExC_state, pat,
5074                                 array, maxarg, NULL, recompile_p,
5075                                 /* $" */
5076                                 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5077
5078             continue;
5079         }
5080
5081
5082         /* we make the assumption here that each op in the list of
5083          * op_siblings maps to one SV pushed onto the stack,
5084          * except for code blocks, with have both an OP_NULL and
5085          * and OP_CONST.
5086          * This allows us to match up the list of SVs against the
5087          * list of OPs to find the next code block.
5088          *
5089          * Note that       PUSHMARK PADSV PADSV ..
5090          * is optimised to
5091          *                 PADRANGE PADSV  PADSV  ..
5092          * so the alignment still works. */
5093
5094         if (oplist) {
5095             if (oplist->op_type == OP_NULL
5096                 && (oplist->op_flags & OPf_SPECIAL))
5097             {
5098                 assert(n < pRExC_state->num_code_blocks);
5099                 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5100                 pRExC_state->code_blocks[n].block = oplist;
5101                 pRExC_state->code_blocks[n].src_regex = NULL;
5102                 n++;
5103                 code = 1;
5104                 oplist = oplist->op_sibling; /* skip CONST */
5105                 assert(oplist);
5106             }
5107             oplist = oplist->op_sibling;;
5108         }
5109
5110         /* apply magic and QR overloading to arg */
5111
5112         SvGETMAGIC(msv);
5113         if (SvROK(msv) && SvAMAGIC(msv)) {
5114             SV *sv = AMG_CALLunary(msv, regexp_amg);
5115             if (sv) {
5116                 if (SvROK(sv))
5117                     sv = SvRV(sv);
5118                 if (SvTYPE(sv) != SVt_REGEXP)
5119                     Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5120                 msv = sv;
5121             }
5122         }
5123
5124         /* try concatenation overload ... */
5125         if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5126                 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5127         {
5128             sv_setsv(pat, sv);
5129             /* overloading involved: all bets are off over literal
5130              * code. Pretend we haven't seen it */
5131             pRExC_state->num_code_blocks -= n;
5132             n = 0;
5133         }
5134         else  {
5135             /* ... or failing that, try "" overload */
5136             while (SvAMAGIC(msv)
5137                     && (sv = AMG_CALLunary(msv, string_amg))
5138                     && sv != msv
5139                     &&  !(   SvROK(msv)
5140                           && SvROK(sv)
5141                           && SvRV(msv) == SvRV(sv))
5142             ) {
5143                 msv = sv;
5144                 SvGETMAGIC(msv);
5145             }
5146             if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5147                 msv = SvRV(msv);
5148
5149             if (pat) {
5150                 /* this is a partially unrolled
5151                  *     sv_catsv_nomg(pat, msv);
5152                  * that allows us to adjust code block indices if
5153                  * needed */
5154                 STRLEN dlen;
5155                 char *dst = SvPV_force_nomg(pat, dlen);
5156                 orig_patlen = dlen;
5157                 if (SvUTF8(msv) && !SvUTF8(pat)) {
5158                     S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5159                     sv_setpvn(pat, dst, dlen);
5160                     SvUTF8_on(pat);
5161                 }
5162                 sv_catsv_nomg(pat, msv);
5163                 rx = msv;
5164             }
5165             else
5166                 pat = msv;
5167
5168             if (code)
5169                 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5170         }
5171
5172         /* extract any code blocks within any embedded qr//'s */
5173         if (rx && SvTYPE(rx) == SVt_REGEXP
5174             && RX_ENGINE((REGEXP*)rx)->op_comp)
5175         {
5176
5177             RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5178             if (ri->num_code_blocks) {
5179                 int i;
5180                 /* the presence of an embedded qr// with code means
5181                  * we should always recompile: the text of the
5182                  * qr// may not have changed, but it may be a
5183                  * different closure than last time */
5184                 *recompile_p = 1;
5185                 Renew(pRExC_state->code_blocks,
5186                     pRExC_state->num_code_blocks + ri->num_code_blocks,
5187                     struct reg_code_block);
5188                 pRExC_state->num_code_blocks += ri->num_code_blocks;
5189
5190                 for (i=0; i < ri->num_code_blocks; i++) {
5191                     struct reg_code_block *src, *dst;
5192                     STRLEN offset =  orig_patlen
5193                         + ReANY((REGEXP *)rx)->pre_prefix;
5194                     assert(n < pRExC_state->num_code_blocks);
5195                     src = &ri->code_blocks[i];
5196                     dst = &pRExC_state->code_blocks[n];
5197                     dst->start      = src->start + offset;
5198                     dst->end        = src->end   + offset;
5199                     dst->block      = src->block;
5200                     dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
5201                                             src->src_regex
5202                                                 ? src->src_regex
5203                                                 : (REGEXP*)rx);
5204                     n++;
5205                 }
5206             }
5207         }
5208     }
5209     /* avoid calling magic multiple times on a single element e.g. =~ $qr */
5210     if (alloced)
5211         SvSETMAGIC(pat);
5212
5213     return pat;
5214 }
5215
5216
5217
5218 /* see if there are any run-time code blocks in the pattern.
5219  * False positives are allowed */
5220
5221 static bool
5222 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5223                     char *pat, STRLEN plen)
5224 {
5225     int n = 0;
5226     STRLEN s;
5227
5228     for (s = 0; s < plen; s++) {
5229         if (n < pRExC_state->num_code_blocks
5230             && s == pRExC_state->code_blocks[n].start)
5231         {
5232             s = pRExC_state->code_blocks[n].end;
5233             n++;
5234             continue;
5235         }
5236         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5237          * positives here */
5238         if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
5239             (pat[s+2] == '{'
5240                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
5241         )
5242             return 1;
5243     }
5244     return 0;
5245 }
5246
5247 /* Handle run-time code blocks. We will already have compiled any direct
5248  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5249  * copy of it, but with any literal code blocks blanked out and
5250  * appropriate chars escaped; then feed it into
5251  *
5252  *    eval "qr'modified_pattern'"
5253  *
5254  * For example,
5255  *
5256  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5257  *
5258  * becomes
5259  *
5260  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5261  *
5262  * After eval_sv()-ing that, grab any new code blocks from the returned qr
5263  * and merge them with any code blocks of the original regexp.
5264  *
5265  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5266  * instead, just save the qr and return FALSE; this tells our caller that
5267  * the original pattern needs upgrading to utf8.
5268  */
5269
5270 static bool
5271 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5272     char *pat, STRLEN plen)
5273 {
5274     SV *qr;
5275
5276     GET_RE_DEBUG_FLAGS_DECL;
5277
5278     if (pRExC_state->runtime_code_qr) {
5279         /* this is the second time we've been called; this should
5280          * only happen if the main pattern got upgraded to utf8
5281          * during compilation; re-use the qr we compiled first time
5282          * round (which should be utf8 too)
5283          */
5284         qr = pRExC_state->runtime_code_qr;
5285         pRExC_state->runtime_code_qr = NULL;
5286         assert(RExC_utf8 && SvUTF8(qr));
5287     }
5288     else {
5289         int n = 0;
5290         STRLEN s;
5291         char *p, *newpat;
5292         int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5293         SV *sv, *qr_ref;
5294         dSP;
5295
5296         /* determine how many extra chars we need for ' and \ escaping */
5297         for (s = 0; s < plen; s++) {
5298             if (pat[s] == '\'' || pat[s] == '\\')
5299                 newlen++;
5300         }
5301
5302         Newx(newpat, newlen, char);
5303         p = newpat;
5304         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5305
5306         for (s = 0; s < plen; s++) {
5307             if (n < pRExC_state->num_code_blocks
5308                 && s == pRExC_state->code_blocks[n].start)
5309             {
5310                 /* blank out literal code block */
5311                 assert(pat[s] == '(');
5312                 while (s <= pRExC_state->code_blocks[n].end) {
5313                     *p++ = '_';
5314                     s++;
5315                 }
5316                 s--;
5317                 n++;
5318                 continue;
5319             }
5320             if (pat[s] == '\'' || pat[s] == '\\')
5321                 *p++ = '\\';
5322             *p++ = pat[s];
5323         }
5324         *p++ = '\'';
5325         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5326             *p++ = 'x';
5327         *p++ = '\0';
5328         DEBUG_COMPILE_r({
5329             PerlIO_printf(Perl_debug_log,
5330                 "%sre-parsing pattern for runtime code:%s %s\n",
5331                 PL_colors[4],PL_colors[5],newpat);
5332         });
5333
5334         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5335         Safefree(newpat);
5336
5337         ENTER;
5338         SAVETMPS;
5339         save_re_context();
5340         PUSHSTACKi(PERLSI_REQUIRE);
5341         /* G_RE_REPARSING causes the toker to collapse \\ into \ when
5342          * parsing qr''; normally only q'' does this. It also alters
5343          * hints handling */
5344         eval_sv(sv, G_SCALAR|G_RE_REPARSING);
5345         SvREFCNT_dec_NN(sv);
5346         SPAGAIN;
5347         qr_ref = POPs;
5348         PUTBACK;
5349         {
5350             SV * const errsv = ERRSV;
5351             if (SvTRUE_NN(errsv))
5352             {
5353                 Safefree(pRExC_state->code_blocks);
5354                 /* use croak_sv ? */
5355                 Perl_croak_nocontext("%s", SvPV_nolen_const(errsv));
5356             }
5357         }
5358         assert(SvROK(qr_ref));
5359         qr = SvRV(qr_ref);
5360         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5361         /* the leaving below frees the tmp qr_ref.
5362          * Give qr a life of its own */
5363         SvREFCNT_inc(qr);
5364         POPSTACK;
5365         FREETMPS;
5366         LEAVE;
5367
5368     }
5369
5370     if (!RExC_utf8 && SvUTF8(qr)) {
5371         /* first time through; the pattern got upgraded; save the
5372          * qr for the next time through */
5373         assert(!pRExC_state->runtime_code_qr);
5374         pRExC_state->runtime_code_qr = qr;
5375         return 0;
5376     }
5377
5378
5379     /* extract any code blocks within the returned qr//  */
5380
5381
5382     /* merge the main (r1) and run-time (r2) code blocks into one */
5383     {
5384         RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
5385         struct reg_code_block *new_block, *dst;
5386         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5387         int i1 = 0, i2 = 0;
5388
5389         if (!r2->num_code_blocks) /* we guessed wrong */
5390         {
5391             SvREFCNT_dec_NN(qr);
5392             return 1;
5393         }
5394
5395         Newx(new_block,
5396             r1->num_code_blocks + r2->num_code_blocks,
5397             struct reg_code_block);
5398         dst = new_block;
5399
5400         while (    i1 < r1->num_code_blocks
5401                 || i2 < r2->num_code_blocks)
5402         {
5403             struct reg_code_block *src;
5404             bool is_qr = 0;
5405
5406             if (i1 == r1->num_code_blocks) {
5407                 src = &r2->code_blocks[i2++];
5408                 is_qr = 1;
5409             }
5410             else if (i2 == r2->num_code_blocks)
5411                 src = &r1->code_blocks[i1++];
5412             else if (  r1->code_blocks[i1].start
5413                      < r2->code_blocks[i2].start)
5414             {
5415                 src = &r1->code_blocks[i1++];
5416                 assert(src->end < r2->code_blocks[i2].start);
5417             }
5418             else {
5419                 assert(  r1->code_blocks[i1].start
5420                        > r2->code_blocks[i2].start);
5421                 src = &r2->code_blocks[i2++];
5422                 is_qr = 1;
5423                 assert(src->end < r1->code_blocks[i1].start);
5424             }
5425
5426             assert(pat[src->start] == '(');
5427             assert(pat[src->end]   == ')');
5428             dst->start      = src->start;
5429             dst->end        = src->end;
5430             dst->block      = src->block;
5431             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5432                                     : src->src_regex;
5433             dst++;
5434         }
5435         r1->num_code_blocks += r2->num_code_blocks;
5436         Safefree(r1->code_blocks);
5437         r1->code_blocks = new_block;
5438     }
5439
5440     SvREFCNT_dec_NN(qr);
5441     return 1;
5442 }
5443
5444
5445 STATIC bool
5446 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
5447                       SSize_t lookbehind, SSize_t offset, SSize_t *minlen, STRLEN longest_length, bool eol, bool meol)
5448 {
5449     /* This is the common code for setting up the floating and fixed length
5450      * string data extracted from Perl_re_op_compile() below.  Returns a boolean
5451      * as to whether succeeded or not */
5452
5453     I32 t;
5454     SSize_t ml;
5455
5456     if (! (longest_length
5457            || (eol /* Can't have SEOL and MULTI */
5458                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5459           )
5460             /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5461         || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5462     {
5463         return FALSE;
5464     }
5465
5466     /* copy the information about the longest from the reg_scan_data
5467         over to the program. */
5468     if (SvUTF8(sv_longest)) {
5469         *rx_utf8 = sv_longest;
5470         *rx_substr = NULL;
5471     } else {
5472         *rx_substr = sv_longest;
5473         *rx_utf8 = NULL;
5474     }
5475     /* end_shift is how many chars that must be matched that
5476         follow this item. We calculate it ahead of time as once the
5477         lookbehind offset is added in we lose the ability to correctly
5478         calculate it.*/
5479     ml = minlen ? *(minlen) : (SSize_t)longest_length;
5480     *rx_end_shift = ml - offset
5481         - longest_length + (SvTAIL(sv_longest) != 0)
5482         + lookbehind;
5483
5484     t = (eol/* Can't have SEOL and MULTI */
5485          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5486     fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5487
5488     return TRUE;
5489 }
5490
5491 /*
5492  * Perl_re_op_compile - the perl internal RE engine's function to compile a
5493  * regular expression into internal code.
5494  * The pattern may be passed either as:
5495  *    a list of SVs (patternp plus pat_count)
5496  *    a list of OPs (expr)
5497  * If both are passed, the SV list is used, but the OP list indicates
5498  * which SVs are actually pre-compiled code blocks
5499  *
5500  * The SVs in the list have magic and qr overloading applied to them (and
5501  * the list may be modified in-place with replacement SVs in the latter
5502  * case).
5503  *
5504  * If the pattern hasn't changed from old_re, then old_re will be
5505  * returned.
5506  *
5507  * eng is the current engine. If that engine has an op_comp method, then
5508  * handle directly (i.e. we assume that op_comp was us); otherwise, just
5509  * do the initial concatenation of arguments and pass on to the external
5510  * engine.
5511  *
5512  * If is_bare_re is not null, set it to a boolean indicating whether the
5513  * arg list reduced (after overloading) to a single bare regex which has
5514  * been returned (i.e. /$qr/).
5515  *
5516  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5517  *
5518  * pm_flags contains the PMf_* flags, typically based on those from the
5519  * pm_flags field of the related PMOP. Currently we're only interested in
5520  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5521  *
5522  * We can't allocate space until we know how big the compiled form will be,
5523  * but we can't compile it (and thus know how big it is) until we've got a
5524  * place to put the code.  So we cheat:  we compile it twice, once with code
5525  * generation turned off and size counting turned on, and once "for real".
5526  * This also means that we don't allocate space until we are sure that the
5527  * thing really will compile successfully, and we never have to move the
5528  * code and thus invalidate pointers into it.  (Note that it has to be in
5529  * one piece because free() must be able to free it all.) [NB: not true in perl]
5530  *
5531  * Beware that the optimization-preparation code in here knows about some
5532  * of the structure of the compiled regexp.  [I'll say.]
5533  */
5534
5535 REGEXP *
5536 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5537                     OP *expr, const regexp_engine* eng, REGEXP *old_re,
5538                      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5539 {
5540     dVAR;
5541     REGEXP *rx;
5542     struct regexp *r;
5543     regexp_internal *ri;
5544     STRLEN plen;
5545     char *exp;
5546     regnode *scan;
5547     I32 flags;
5548     SSize_t minlen = 0;
5549     U32 rx_flags;
5550     SV *pat;
5551     SV *code_blocksv = NULL;
5552     SV** new_patternp = patternp;
5553
5554     /* these are all flags - maybe they should be turned
5555      * into a single int with different bit masks */
5556     I32 sawlookahead = 0;
5557     I32 sawplus = 0;
5558     I32 sawopen = 0;
5559     I32 sawminmod = 0;
5560
5561     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5562     bool recompile = 0;
5563     bool runtime_code = 0;
5564     scan_data_t data;
5565     RExC_state_t RExC_state;
5566     RExC_state_t * const pRExC_state = &RExC_state;
5567 #ifdef TRIE_STUDY_OPT
5568     int restudied = 0;
5569     RExC_state_t copyRExC_state;
5570 #endif
5571     GET_RE_DEBUG_FLAGS_DECL;
5572
5573     PERL_ARGS_ASSERT_RE_OP_COMPILE;
5574
5575     DEBUG_r(if (!PL_colorset) reginitcolors());
5576
5577 #ifndef PERL_IN_XSUB_RE
5578     /* Initialize these here instead of as-needed, as is quick and avoids
5579      * having to test them each time otherwise */
5580     if (! PL_AboveLatin1) {
5581         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5582         PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5583         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5584
5585         PL_L1Posix_ptrs[_CC_ALPHANUMERIC]
5586                                 = _new_invlist_C_array(L1PosixAlnum_invlist);
5587         PL_Posix_ptrs[_CC_ALPHANUMERIC]
5588                                 = _new_invlist_C_array(PosixAlnum_invlist);
5589
5590         PL_L1Posix_ptrs[_CC_ALPHA]
5591                                 = _new_invlist_C_array(L1PosixAlpha_invlist);
5592         PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PosixAlpha_invlist);
5593
5594         PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PosixBlank_invlist);
5595         PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(XPosixBlank_invlist);
5596
5597         /* Cased is the same as Alpha in the ASCII range */
5598         PL_L1Posix_ptrs[_CC_CASED] =  _new_invlist_C_array(L1Cased_invlist);
5599         PL_Posix_ptrs[_CC_CASED] =  _new_invlist_C_array(PosixAlpha_invlist);
5600
5601         PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PosixCntrl_invlist);
5602         PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(XPosixCntrl_invlist);
5603
5604         PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5605         PL_L1Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5606
5607         PL_L1Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(L1PosixGraph_invlist);
5608         PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PosixGraph_invlist);
5609
5610         PL_L1Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(L1PosixLower_invlist);
5611         PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PosixLower_invlist);
5612
5613         PL_L1Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(L1PosixPrint_invlist);
5614         PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PosixPrint_invlist);
5615
5616         PL_L1Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(L1PosixPunct_invlist);
5617         PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PosixPunct_invlist);
5618
5619         PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PerlSpace_invlist);
5620         PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(XPerlSpace_invlist);
5621         PL_Posix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(PosixSpace_invlist);
5622         PL_XPosix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(XPosixSpace_invlist);
5623
5624         PL_L1Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(L1PosixUpper_invlist);
5625         PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PosixUpper_invlist);
5626
5627         PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist);
5628
5629         PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PosixWord_invlist);
5630         PL_L1Posix_ptrs[_CC_WORDCHAR]
5631                                 = _new_invlist_C_array(L1PosixWord_invlist);
5632
5633         PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PosixXDigit_invlist);
5634         PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist);
5635
5636         PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
5637     }
5638 #endif
5639
5640     pRExC_state->code_blocks = NULL;
5641     pRExC_state->num_code_blocks = 0;
5642
5643     if (is_bare_re)
5644         *is_bare_re = FALSE;
5645
5646     if (expr && (expr->op_type == OP_LIST ||
5647                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5648         /* allocate code_blocks if needed */
5649         OP *o;
5650         int ncode = 0;
5651
5652         for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
5653             if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5654                 ncode++; /* count of DO blocks */
5655         if (ncode) {
5656             pRExC_state->num_code_blocks = ncode;
5657             Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5658         }
5659     }
5660
5661     if (!pat_count) {
5662         /* compile-time pattern with just OP_CONSTs and DO blocks */
5663
5664         int n;
5665         OP *o;
5666
5667         /* find how many CONSTs there are */
5668         assert(expr);
5669         n = 0;
5670         if (expr->op_type == OP_CONST)
5671             n = 1;
5672         else
5673             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5674                 if (o->op_type == OP_CONST)
5675                     n++;
5676             }
5677
5678         /* fake up an SV array */
5679
5680         assert(!new_patternp);
5681         Newx(new_patternp, n, SV*);
5682         SAVEFREEPV(new_patternp);
5683         pat_count = n;
5684
5685         n = 0;
5686         if (expr->op_type == OP_CONST)
5687             new_patternp[n] = cSVOPx_sv(expr);
5688         else
5689             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5690                 if (o->op_type == OP_CONST)
5691                     new_patternp[n++] = cSVOPo_sv;
5692             }
5693
5694     }
5695
5696     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5697         "Assembling pattern from %d elements%s\n", pat_count,
5698             orig_rx_flags & RXf_SPLIT ? " for split" : ""));
5699
5700     /* set expr to the first arg op */
5701
5702     if (pRExC_state->num_code_blocks
5703          && expr->op_type != OP_CONST)
5704     {
5705             expr = cLISTOPx(expr)->op_first;
5706             assert(   expr->op_type == OP_PUSHMARK
5707                    || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
5708                    || expr->op_type == OP_PADRANGE);
5709             expr = expr->op_sibling;
5710     }
5711
5712     pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
5713                         expr, &recompile, NULL);
5714
5715     /* handle bare (possibly after overloading) regex: foo =~ $re */
5716     {
5717         SV *re = pat;
5718         if (SvROK(re))
5719             re = SvRV(re);
5720         if (SvTYPE(re) == SVt_REGEXP) {
5721             if (is_bare_re)
5722                 *is_bare_re = TRUE;
5723             SvREFCNT_inc(re);
5724             Safefree(pRExC_state->code_blocks);
5725             DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5726                 "Precompiled pattern%s\n",
5727                     orig_rx_flags & RXf_SPLIT ? " for split" : ""));
5728
5729             return (REGEXP*)re;
5730         }
5731     }
5732
5733     exp = SvPV_nomg(pat, plen);
5734
5735     if (!eng->op_comp) {
5736         if ((SvUTF8(pat) && IN_BYTES)
5737                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
5738         {
5739             /* make a temporary copy; either to convert to bytes,
5740              * or to avoid repeating get-magic / overloaded stringify */
5741             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5742                                         (IN_BYTES ? 0 : SvUTF8(pat)));
5743         }
5744         Safefree(pRExC_state->code_blocks);
5745         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5746     }
5747
5748     /* ignore the utf8ness if the pattern is 0 length */
5749     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5750     RExC_uni_semantics = 0;
5751     RExC_contains_locale = 0;
5752     pRExC_state->runtime_code_qr = NULL;
5753
5754     DEBUG_COMPILE_r({
5755             SV *dsv= sv_newmortal();
5756             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
5757             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5758                           PL_colors[4],PL_colors[5],s);
5759         });
5760
5761   redo_first_pass:
5762     /* we jump here if we upgrade the pattern to utf8 and have to
5763      * recompile */
5764
5765     if ((pm_flags & PMf_USE_RE_EVAL)
5766                 /* this second condition covers the non-regex literal case,
5767                  * i.e.  $foo =~ '(?{})'. */
5768                 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
5769     )
5770         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
5771
5772     /* return old regex if pattern hasn't changed */
5773     /* XXX: note in the below we have to check the flags as well as the pattern.
5774      *
5775      * Things get a touch tricky as we have to compare the utf8 flag independently
5776      * from the compile flags.
5777      */
5778
5779     if (   old_re
5780         && !recompile
5781         && !!RX_UTF8(old_re) == !!RExC_utf8
5782         && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
5783         && RX_PRECOMP(old_re)
5784         && RX_PRELEN(old_re) == plen
5785         && memEQ(RX_PRECOMP(old_re), exp, plen)
5786         && !runtime_code /* with runtime code, always recompile */ )
5787     {
5788         Safefree(pRExC_state->code_blocks);
5789         return old_re;
5790     }
5791
5792     rx_flags = orig_rx_flags;
5793
5794     if (initial_charset == REGEX_LOCALE_CHARSET) {
5795         RExC_contains_locale = 1;
5796     }
5797     else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5798
5799         /* Set to use unicode semantics if the pattern is in utf8 and has the
5800          * 'depends' charset specified, as it means unicode when utf8  */
5801         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5802     }
5803
5804     RExC_precomp = exp;
5805     RExC_flags = rx_flags;
5806     RExC_pm_flags = pm_flags;
5807
5808     if (runtime_code) {
5809         if (TAINTING_get && TAINT_get)
5810             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
5811
5812         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
5813             /* whoops, we have a non-utf8 pattern, whilst run-time code
5814              * got compiled as utf8. Try again with a utf8 pattern */
5815             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
5816                                     pRExC_state->num_code_blocks);
5817             goto redo_first_pass;
5818         }
5819     }
5820     assert(!pRExC_state->runtime_code_qr);
5821
5822     RExC_sawback = 0;
5823
5824     RExC_seen = 0;
5825     RExC_in_lookbehind = 0;
5826     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5827     RExC_extralen = 0;
5828     RExC_override_recoding = 0;
5829     RExC_in_multi_char_class = 0;
5830
5831     /* First pass: determine size, legality. */
5832     RExC_parse = exp;
5833     RExC_start = exp;
5834     RExC_end = exp + plen;
5835     RExC_naughty = 0;
5836     RExC_npar = 1;
5837     RExC_nestroot = 0;
5838     RExC_size = 0L;
5839     RExC_emit = &RExC_emit_dummy;
5840     RExC_whilem_seen = 0;
5841     RExC_open_parens = NULL;
5842     RExC_close_parens = NULL;
5843     RExC_opend = NULL;
5844     RExC_paren_names = NULL;
5845 #ifdef DEBUGGING
5846     RExC_paren_name_list = NULL;
5847 #endif
5848     RExC_recurse = NULL;
5849     RExC_recurse_count = 0;
5850     pRExC_state->code_index = 0;
5851
5852 #if 0 /* REGC() is (currently) a NOP at the first pass.
5853        * Clever compilers notice this and complain. --jhi */
5854     REGC((U8)REG_MAGIC, (char*)RExC_emit);
5855 #endif
5856     DEBUG_PARSE_r(
5857         PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5858         RExC_lastnum=0;
5859         RExC_lastparse=NULL;
5860     );
5861     /* reg may croak on us, not giving us a chance to free
5862        pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
5863        need it to survive as long as the regexp (qr/(?{})/).
5864        We must check that code_blocksv is not already set, because we may
5865        have jumped back to restart the sizing pass. */
5866     if (pRExC_state->code_blocks && !code_blocksv) {
5867         code_blocksv = newSV_type(SVt_PV);
5868         SAVEFREESV(code_blocksv);
5869         SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
5870         SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
5871     }
5872     if (reg(pRExC_state, 0, &flags,1) == NULL) {
5873         /* It's possible to write a regexp in ascii that represents Unicode
5874         codepoints outside of the byte range, such as via \x{100}. If we
5875         detect such a sequence we have to convert the entire pattern to utf8
5876         and then recompile, as our sizing calculation will have been based
5877         on 1 byte == 1 character, but we will need to use utf8 to encode
5878         at least some part of the pattern, and therefore must convert the whole
5879         thing.
5880         -- dmq */
5881         if (flags & RESTART_UTF8) {
5882             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
5883                                     pRExC_state->num_code_blocks);
5884             goto redo_first_pass;
5885         }
5886         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
5887     }
5888     if (code_blocksv)
5889         SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
5890
5891     DEBUG_PARSE_r({
5892         PerlIO_printf(Perl_debug_log,
5893             "Required size %"IVdf" nodes\n"
5894             "Starting second pass (creation)\n",
5895             (IV)RExC_size);
5896         RExC_lastnum=0;
5897         RExC_lastparse=NULL;
5898     });
5899
5900     /* The first pass could have found things that force Unicode semantics */
5901     if ((RExC_utf8 || RExC_uni_semantics)
5902          && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5903     {
5904         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5905     }
5906
5907     /* Small enough for pointer-storage convention?
5908        If extralen==0, this means that we will not need long jumps. */
5909     if (RExC_size >= 0x10000L && RExC_extralen)
5910         RExC_size += RExC_extralen;
5911     else
5912         RExC_extralen = 0;
5913     if (RExC_whilem_seen > 15)
5914         RExC_whilem_seen = 15;
5915
5916     /* Allocate space and zero-initialize. Note, the two step process
5917        of zeroing when in debug mode, thus anything assigned has to
5918        happen after that */
5919     rx = (REGEXP*) newSV_type(SVt_REGEXP);
5920     r = ReANY(rx);
5921     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5922          char, regexp_internal);
5923     if ( r == NULL || ri == NULL )
5924         FAIL("Regexp out of space");
5925 #ifdef DEBUGGING
5926     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5927     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5928 #else
5929     /* bulk initialize base fields with 0. */
5930     Zero(ri, sizeof(regexp_internal), char);
5931 #endif
5932
5933     /* non-zero initialization begins here */
5934     RXi_SET( r, ri );
5935     r->engine= eng;
5936     r->extflags = rx_flags;
5937     RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
5938
5939     if (pm_flags & PMf_IS_QR) {
5940         ri->code_blocks = pRExC_state->code_blocks;
5941         ri->num_code_blocks = pRExC_state->num_code_blocks;
5942     }
5943     else
5944     {
5945         int n;
5946         for (n = 0; n < pRExC_state->num_code_blocks; n++)
5947             if (pRExC_state->code_blocks[n].src_regex)
5948                 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
5949         SAVEFREEPV(pRExC_state->code_blocks);
5950     }
5951
5952     {
5953         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5954         bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5955
5956         /* The caret is output if there are any defaults: if not all the STD
5957          * flags are set, or if no character set specifier is needed */
5958         bool has_default =
5959                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5960                     || ! has_charset);
5961         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5962         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5963                             >> RXf_PMf_STD_PMMOD_SHIFT);
5964         const char *fptr = STD_PAT_MODS;        /*"msix"*/
5965         char *p;
5966         /* Allocate for the worst case, which is all the std flags are turned
5967          * on.  If more precision is desired, we could do a population count of
5968          * the flags set.  This could be done with a small lookup table, or by
5969          * shifting, masking and adding, or even, when available, assembly
5970          * language for a machine-language population count.
5971          * We never output a minus, as all those are defaults, so are
5972          * covered by the caret */
5973         const STRLEN wraplen = plen + has_p + has_runon
5974             + has_default       /* If needs a caret */
5975
5976                 /* If needs a character set specifier */
5977             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5978             + (sizeof(STD_PAT_MODS) - 1)
5979             + (sizeof("(?:)") - 1);
5980
5981         Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
5982         r->xpv_len_u.xpvlenu_pv = p;
5983         if (RExC_utf8)
5984             SvFLAGS(rx) |= SVf_UTF8;
5985         *p++='('; *p++='?';
5986
5987         /* If a default, cover it using the caret */
5988         if (has_default) {
5989             *p++= DEFAULT_PAT_MOD;
5990         }
5991         if (has_charset) {
5992             STRLEN len;
5993             const char* const name = get_regex_charset_name(r->extflags, &len);
5994             Copy(name, p, len, char);
5995             p += len;
5996         }
5997         if (has_p)
5998             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5999         {
6000             char ch;
6001             while((ch = *fptr++)) {
6002                 if(reganch & 1)
6003                     *p++ = ch;
6004                 reganch >>= 1;
6005             }
6006         }
6007
6008         *p++ = ':';
6009         Copy(RExC_precomp, p, plen, char);
6010         assert ((RX_WRAPPED(rx) - p) < 16);
6011         r->pre_prefix = p - RX_WRAPPED(rx);
6012         p += plen;
6013         if (has_runon)
6014             *p++ = '\n';
6015         *p++ = ')';
6016         *p = 0;
6017         SvCUR_set(rx, p - RX_WRAPPED(rx));
6018     }
6019
6020     r->intflags = 0;
6021     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6022
6023     if (RExC_seen & REG_SEEN_RECURSE) {
6024         Newxz(RExC_open_parens, RExC_npar,regnode *);
6025         SAVEFREEPV(RExC_open_parens);
6026         Newxz(RExC_close_parens,RExC_npar,regnode *);
6027         SAVEFREEPV(RExC_close_parens);
6028     }
6029
6030     /* Useful during FAIL. */
6031 #ifdef RE_TRACK_PATTERN_OFFSETS
6032     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6033     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6034                           "%s %"UVuf" bytes for offset annotations.\n",
6035                           ri->u.offsets ? "Got" : "Couldn't get",
6036                           (UV)((2*RExC_size+1) * sizeof(U32))));
6037 #endif
6038     SetProgLen(ri,RExC_size);
6039     RExC_rx_sv = rx;
6040     RExC_rx = r;
6041     RExC_rxi = ri;
6042
6043     /* Second pass: emit code. */
6044     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
6045     RExC_pm_flags = pm_flags;
6046     RExC_parse = exp;
6047     RExC_end = exp + plen;
6048     RExC_naughty = 0;
6049     RExC_npar = 1;
6050     RExC_emit_start = ri->program;
6051     RExC_emit = ri->program;
6052     RExC_emit_bound = ri->program + RExC_size + 1;
6053     pRExC_state->code_index = 0;
6054
6055     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6056     if (reg(pRExC_state, 0, &flags,1) == NULL) {
6057         ReREFCNT_dec(rx);
6058         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6059     }
6060     /* XXXX To minimize changes to RE engine we always allocate
6061        3-units-long substrs field. */
6062     Newx(r->substrs, 1, struct reg_substr_data);
6063     if (RExC_recurse_count) {
6064         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6065         SAVEFREEPV(RExC_recurse);
6066     }
6067
6068 reStudy:
6069     r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6070     Zero(r->substrs, 1, struct reg_substr_data);
6071
6072 #ifdef TRIE_STUDY_OPT
6073     if (!restudied) {
6074         StructCopy(&zero_scan_data, &data, scan_data_t);
6075         copyRExC_state = RExC_state;
6076     } else {
6077         U32 seen=RExC_seen;
6078         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6079
6080         RExC_state = copyRExC_state;
6081         if (seen & REG_TOP_LEVEL_BRANCHES)
6082             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6083         else
6084             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
6085         StructCopy(&zero_scan_data, &data, scan_data_t);
6086     }
6087 #else
6088     StructCopy(&zero_scan_data, &data, scan_data_t);
6089 #endif
6090
6091     /* Dig out information for optimizations. */
6092     r->extflags = RExC_flags; /* was pm_op */
6093     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6094
6095     if (UTF)
6096         SvUTF8_on(rx);  /* Unicode in it? */
6097     ri->regstclass = NULL;
6098     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
6099         r->intflags |= PREGf_NAUGHTY;
6100     scan = ri->program + 1;             /* First BRANCH. */
6101
6102     /* testing for BRANCH here tells us whether there is "must appear"
6103        data in the pattern. If there is then we can use it for optimisations */
6104     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
6105         SSize_t fake;
6106         STRLEN longest_float_length, longest_fixed_length;
6107         struct regnode_charclass_class ch_class; /* pointed to by data */
6108         int stclass_flag;
6109         SSize_t last_close = 0; /* pointed to by data */
6110         regnode *first= scan;
6111         regnode *first_next= regnext(first);
6112         /*
6113          * Skip introductions and multiplicators >= 1
6114          * so that we can extract the 'meat' of the pattern that must
6115          * match in the large if() sequence following.
6116          * NOTE that EXACT is NOT covered here, as it is normally
6117          * picked up by the optimiser separately.
6118          *
6119          * This is unfortunate as the optimiser isnt handling lookahead
6120          * properly currently.
6121          *
6122          */
6123         while ((OP(first) == OPEN && (sawopen = 1)) ||
6124                /* An OR of *one* alternative - should not happen now. */
6125             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6126             /* for now we can't handle lookbehind IFMATCH*/
6127             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6128             (OP(first) == PLUS) ||
6129             (OP(first) == MINMOD) ||
6130                /* An {n,m} with n>0 */
6131             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6132             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6133         {
6134                 /*
6135                  * the only op that could be a regnode is PLUS, all the rest
6136                  * will be regnode_1 or regnode_2.
6137                  *
6138                  * (yves doesn't think this is true)
6139                  */
6140                 if (OP(first) == PLUS)
6141                     sawplus = 1;
6142                 else {
6143                     if (OP(first) == MINMOD)
6144                         sawminmod = 1;
6145                     first += regarglen[OP(first)];
6146                 }
6147                 first = NEXTOPER(first);
6148                 first_next= regnext(first);
6149         }
6150
6151         /* Starting-point info. */
6152       again:
6153         DEBUG_PEEP("first:",first,0);
6154         /* Ignore EXACT as we deal with it later. */
6155         if (PL_regkind[OP(first)] == EXACT) {
6156             if (OP(first) == EXACT)
6157                 NOOP;   /* Empty, get anchored substr later. */
6158             else
6159                 ri->regstclass = first;
6160         }
6161 #ifdef TRIE_STCLASS
6162         else if (PL_regkind[OP(first)] == TRIE &&
6163                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6164         {
6165             regnode *trie_op;
6166             /* this can happen only on restudy */
6167             if ( OP(first) == TRIE ) {
6168                 struct regnode_1 *trieop = (struct regnode_1 *)
6169                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
6170                 StructCopy(first,trieop,struct regnode_1);
6171                 trie_op=(regnode *)trieop;
6172             } else {
6173                 struct regnode_charclass *trieop = (struct regnode_charclass *)
6174                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6175                 StructCopy(first,trieop,struct regnode_charclass);
6176                 trie_op=(regnode *)trieop;
6177             }
6178             OP(trie_op)+=2;
6179             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6180             ri->regstclass = trie_op;
6181         }
6182 #endif
6183         else if (REGNODE_SIMPLE(OP(first)))
6184             ri->regstclass = first;
6185         else if (PL_regkind[OP(first)] == BOUND ||
6186                  PL_regkind[OP(first)] == NBOUND)
6187             ri->regstclass = first;
6188         else if (PL_regkind[OP(first)] == BOL) {
6189             r->extflags |= (OP(first) == MBOL
6190                            ? RXf_ANCH_MBOL
6191                            : (OP(first) == SBOL
6192                               ? RXf_ANCH_SBOL
6193                               : RXf_ANCH_BOL));
6194             first = NEXTOPER(first);
6195             goto again;
6196         }
6197         else if (OP(first) == GPOS) {
6198             r->extflags |= RXf_ANCH_GPOS;
6199             first = NEXTOPER(first);
6200             goto again;
6201         }
6202         else if ((!sawopen || !RExC_sawback) &&
6203             (OP(first) == STAR &&
6204             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6205             !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6206         {
6207             /* turn .* into ^.* with an implied $*=1 */
6208             const int type =
6209                 (OP(NEXTOPER(first)) == REG_ANY)
6210                     ? RXf_ANCH_MBOL
6211                     : RXf_ANCH_SBOL;
6212             r->extflags |= type;
6213             r->intflags |= PREGf_IMPLICIT;
6214             first = NEXTOPER(first);
6215             goto again;
6216         }
6217         if (sawplus && !sawminmod && !sawlookahead && (!sawopen || !RExC_sawback)
6218             && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6219             /* x+ must match at the 1st pos of run of x's */
6220             r->intflags |= PREGf_SKIP;
6221
6222         /* Scan is after the zeroth branch, first is atomic matcher. */
6223 #ifdef TRIE_STUDY_OPT
6224         DEBUG_PARSE_r(
6225             if (!restudied)
6226                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6227                               (IV)(first - scan + 1))
6228         );
6229 #else
6230         DEBUG_PARSE_r(
6231             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6232                 (IV)(first - scan + 1))
6233         );
6234 #endif
6235
6236
6237         /*
6238         * If there's something expensive in the r.e., find the
6239         * longest literal string that must appear and make it the
6240         * regmust.  Resolve ties in favor of later strings, since
6241         * the regstart check works with the beginning of the r.e.
6242         * and avoiding duplication strengthens checking.  Not a
6243         * strong reason, but sufficient in the absence of others.
6244         * [Now we resolve ties in favor of the earlier string if
6245         * it happens that c_offset_min has been invalidated, since the
6246         * earlier string may buy us something the later one won't.]
6247         */
6248
6249         data.longest_fixed = newSVpvs("");
6250         data.longest_float = newSVpvs("");
6251         data.last_found = newSVpvs("");
6252         data.longest = &(data.longest_fixed);
6253         ENTER_with_name("study_chunk");
6254         SAVEFREESV(data.longest_fixed);
6255         SAVEFREESV(data.longest_float);
6256         SAVEFREESV(data.last_found);
6257         first = scan;
6258         if (!ri->regstclass) {
6259             cl_init(pRExC_state, &ch_class);
6260             data.start_class = &ch_class;
6261             stclass_flag = SCF_DO_STCLASS_AND;
6262         } else                          /* XXXX Check for BOUND? */
6263             stclass_flag = 0;
6264         data.last_closep = &last_close;
6265
6266         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6267             &data, -1, NULL, NULL,
6268             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
6269                           | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6270             0);
6271
6272
6273         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6274
6275
6276         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6277              && data.last_start_min == 0 && data.last_end > 0
6278              && !RExC_seen_zerolen
6279              && !(RExC_seen & REG_SEEN_VERBARG)
6280              && !((RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6281             r->extflags |= RXf_CHECK_ALL;
6282         scan_commit(pRExC_state, &data,&minlen,0);
6283
6284         longest_float_length = CHR_SVLEN(data.longest_float);
6285
6286         if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
6287                    && data.offset_fixed == data.offset_float_min
6288                    && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6289             && S_setup_longest (aTHX_ pRExC_state,
6290                                     data.longest_float,
6291                                     &(r->float_utf8),
6292                                     &(r->float_substr),
6293                                     &(r->float_end_shift),
6294                                     data.lookbehind_float,
6295                                     data.offset_float_min,
6296                                     data.minlen_float,
6297                                     longest_float_length,
6298                                     cBOOL(data.flags & SF_FL_BEFORE_EOL),
6299                                     cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6300         {
6301             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6302             r->float_max_offset = data.offset_float_max;
6303             if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
6304                 r->float_max_offset -= data.lookbehind_float;
6305             SvREFCNT_inc_simple_void_NN(data.longest_float);
6306         }
6307         else {
6308             r->float_substr = r->float_utf8 = NULL;
6309             longest_float_length = 0;
6310         }
6311
6312         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6313
6314         if (S_setup_longest (aTHX_ pRExC_state,
6315                                 data.longest_fixed,
6316                                 &(r->anchored_utf8),
6317                                 &(r->anchored_substr),
6318                                 &(r->anchored_end_shift),
6319                                 data.lookbehind_fixed,
6320                                 data.offset_fixed,
6321                                 data.minlen_fixed,
6322                                 longest_fixed_length,
6323                                 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6324                                 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6325         {
6326             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6327             SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6328         }
6329         else {
6330             r->anchored_substr = r->anchored_utf8 = NULL;
6331             longest_fixed_length = 0;
6332         }
6333         LEAVE_with_name("study_chunk");
6334
6335         if (ri->regstclass
6336             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6337             ri->regstclass = NULL;
6338
6339         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6340             && stclass_flag
6341             && ! TEST_SSC_EOS(data.start_class)
6342             && !cl_is_anything(data.start_class))
6343         {
6344             const U32 n = add_data(pRExC_state, 1, "f");
6345             OP(data.start_class) = ANYOF_SYNTHETIC;
6346
6347             Newx(RExC_rxi->data->data[n], 1,
6348                 struct regnode_charclass_class);
6349             StructCopy(data.start_class,
6350                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6351                        struct regnode_charclass_class);
6352             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6353             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6354             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6355                       regprop(r, sv, (regnode*)data.start_class);
6356                       PerlIO_printf(Perl_debug_log,
6357                                     "synthetic stclass \"%s\".\n",
6358                                     SvPVX_const(sv));});
6359         }
6360
6361         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6362         if (longest_fixed_length > longest_float_length) {
6363             r->check_end_shift = r->anchored_end_shift;
6364             r->check_substr = r->anchored_substr;
6365             r->check_utf8 = r->anchored_utf8;
6366             r->check_offset_min = r->check_offset_max = r->anchored_offset;
6367             if (r->extflags & RXf_ANCH_SINGLE)
6368                 r->extflags |= RXf_NOSCAN;
6369         }
6370         else {
6371             r->check_end_shift = r->float_end_shift;
6372             r->check_substr = r->float_substr;
6373             r->check_utf8 = r->float_utf8;
6374             r->check_offset_min = r->float_min_offset;
6375             r->check_offset_max = r->float_max_offset;
6376         }
6377         if ((r->check_substr || r->check_utf8) ) {
6378             r->extflags |= RXf_USE_INTUIT;
6379             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6380                 r->extflags |= RXf_INTUIT_TAIL;
6381         }
6382         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6383         if ( (STRLEN)minlen < longest_float_length )
6384             minlen= longest_float_length;
6385         if ( (STRLEN)minlen < longest_fixed_length )
6386             minlen= longest_fixed_length;
6387         */
6388     }
6389     else {
6390         /* Several toplevels. Best we can is to set minlen. */
6391         SSize_t fake;
6392         struct regnode_charclass_class ch_class;
6393         SSize_t last_close = 0;
6394
6395         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6396
6397         scan = ri->program + 1;
6398         cl_init(pRExC_state, &ch_class);
6399         data.start_class = &ch_class;
6400         data.last_closep = &last_close;
6401
6402
6403         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6404             &data, -1, NULL, NULL,
6405             SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS
6406                               |(restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6407             0);
6408
6409         CHECK_RESTUDY_GOTO_butfirst(NOOP);
6410
6411         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6412                 = r->float_substr = r->float_utf8 = NULL;
6413
6414         if (! TEST_SSC_EOS(data.start_class)
6415             && !cl_is_anything(data.start_class))
6416         {
6417             const U32 n = add_data(pRExC_state, 1, "f");
6418             OP(data.start_class) = ANYOF_SYNTHETIC;
6419
6420             Newx(RExC_rxi->data->data[n], 1,
6421                 struct regnode_charclass_class);
6422             StructCopy(data.start_class,
6423                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6424                        struct regnode_charclass_class);
6425             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6426             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6427             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6428                       regprop(r, sv, (regnode*)data.start_class);
6429                       PerlIO_printf(Perl_debug_log,
6430                                     "synthetic stclass \"%s\".\n",
6431                                     SvPVX_const(sv));});
6432         }
6433     }
6434
6435     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6436        the "real" pattern. */
6437     DEBUG_OPTIMISE_r({
6438         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6439                       (IV)minlen, (IV)r->minlen);
6440     });
6441     r->minlenret = minlen;
6442     if (r->minlen < minlen)
6443         r->minlen = minlen;
6444
6445     if (RExC_seen & REG_SEEN_GPOS)
6446         r->extflags |= RXf_GPOS_SEEN;
6447     if (RExC_seen & REG_SEEN_LOOKBEHIND)
6448         r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the lookbehind */
6449     if (pRExC_state->num_code_blocks)
6450         r->extflags |= RXf_EVAL_SEEN;
6451     if (RExC_seen & REG_SEEN_CANY)
6452         r->extflags |= RXf_CANY_SEEN;
6453     if (RExC_seen & REG_SEEN_VERBARG)
6454     {
6455         r->intflags |= PREGf_VERBARG_SEEN;
6456         r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
6457     }
6458     if (RExC_seen & REG_SEEN_CUTGROUP)
6459         r->intflags |= PREGf_CUTGROUP_SEEN;
6460     if (pm_flags & PMf_USE_RE_EVAL)
6461         r->intflags |= PREGf_USE_RE_EVAL;
6462     if (RExC_paren_names)
6463         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6464     else
6465         RXp_PAREN_NAMES(r) = NULL;
6466
6467     {
6468         regnode *first = ri->program + 1;
6469         U8 fop = OP(first);
6470         regnode *next = NEXTOPER(first);
6471         U8 nop = OP(next);
6472
6473         if (PL_regkind[fop] == NOTHING && nop == END)
6474             r->extflags |= RXf_NULL;
6475         else if (PL_regkind[fop] == BOL && nop == END)
6476             r->extflags |= RXf_START_ONLY;
6477         else if (fop == PLUS && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE && OP(regnext(first)) == END)
6478             r->extflags |= RXf_WHITE;
6479         else if ( r->extflags & RXf_SPLIT && fop == EXACT && STR_LEN(first) == 1 && *(STRING(first)) == ' ' && OP(regnext(first)) == END )
6480             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6481
6482     }
6483 #ifdef DEBUGGING
6484     if (RExC_paren_names) {
6485         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6486         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6487     } else
6488 #endif
6489         ri->name_list_idx = 0;
6490
6491     if (RExC_recurse_count) {
6492         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6493             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6494             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6495         }
6496     }
6497     Newxz(r->offs, RExC_npar, regexp_paren_pair);
6498     /* assume we don't need to swap parens around before we match */
6499
6500     DEBUG_DUMP_r({
6501         PerlIO_printf(Perl_debug_log,"Final program:\n");
6502         regdump(r);
6503     });
6504 #ifdef RE_TRACK_PATTERN_OFFSETS
6505     DEBUG_OFFSETS_r(if (ri->u.offsets) {
6506         const STRLEN len = ri->u.offsets[0];
6507         STRLEN i;
6508         GET_RE_DEBUG_FLAGS_DECL;
6509         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6510         for (i = 1; i <= len; i++) {
6511             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6512                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6513                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6514             }
6515         PerlIO_printf(Perl_debug_log, "\n");
6516     });
6517 #endif
6518
6519 #ifdef USE_ITHREADS
6520     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
6521      * by setting the regexp SV to readonly-only instead. If the
6522      * pattern's been recompiled, the USEDness should remain. */
6523     if (old_re && SvREADONLY(old_re))
6524         SvREADONLY_on(rx);
6525 #endif
6526     return rx;
6527 }
6528
6529
6530 SV*
6531 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6532                     const U32 flags)
6533 {
6534     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6535
6536     PERL_UNUSED_ARG(value);
6537
6538     if (flags & RXapif_FETCH) {
6539         return reg_named_buff_fetch(rx, key, flags);
6540     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6541         Perl_croak_no_modify();
6542         return NULL;
6543     } else if (flags & RXapif_EXISTS) {
6544         return reg_named_buff_exists(rx, key, flags)
6545             ? &PL_sv_yes
6546             : &PL_sv_no;
6547     } else if (flags & RXapif_REGNAMES) {
6548         return reg_named_buff_all(rx, flags);
6549     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6550         return reg_named_buff_scalar(rx, flags);
6551     } else {
6552         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6553         return NULL;
6554     }
6555 }
6556
6557 SV*
6558 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6559                          const U32 flags)
6560 {
6561     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6562     PERL_UNUSED_ARG(lastkey);
6563
6564     if (flags & RXapif_FIRSTKEY)
6565         return reg_named_buff_firstkey(rx, flags);
6566     else if (flags & RXapif_NEXTKEY)
6567         return reg_named_buff_nextkey(rx, flags);
6568     else {
6569         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6570         return NULL;
6571     }
6572 }
6573
6574 SV*
6575 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6576                           const U32 flags)
6577 {
6578     AV *retarray = NULL;
6579     SV *ret;
6580     struct regexp *const rx = ReANY(r);
6581
6582     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6583
6584     if (flags & RXapif_ALL)
6585         retarray=newAV();
6586
6587     if (rx && RXp_PAREN_NAMES(rx)) {
6588         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6589         if (he_str) {
6590             IV i;
6591             SV* sv_dat=HeVAL(he_str);
6592             I32 *nums=(I32*)SvPVX(sv_dat);
6593             for ( i=0; i<SvIVX(sv_dat); i++ ) {
6594                 if ((I32)(rx->nparens) >= nums[i]
6595                     && rx->offs[nums[i]].start != -1
6596                     && rx->offs[nums[i]].end != -1)
6597                 {
6598                     ret = newSVpvs("");
6599                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6600                     if (!retarray)
6601                         return ret;
6602                 } else {
6603                     if (retarray)
6604                         ret = newSVsv(&PL_sv_undef);
6605                 }
6606                 if (retarray)
6607                     av_push(retarray, ret);
6608             }
6609             if (retarray)
6610                 return newRV_noinc(MUTABLE_SV(retarray));
6611         }
6612     }
6613     return NULL;
6614 }
6615
6616 bool
6617 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6618                            const U32 flags)
6619 {
6620     struct regexp *const rx = ReANY(r);
6621
6622     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6623
6624     if (rx && RXp_PAREN_NAMES(rx)) {
6625         if (flags & RXapif_ALL) {
6626             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6627         } else {
6628             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6629             if (sv) {
6630                 SvREFCNT_dec_NN(sv);
6631                 return TRUE;
6632             } else {
6633                 return FALSE;
6634             }
6635         }
6636     } else {
6637         return FALSE;
6638     }
6639 }
6640
6641 SV*
6642 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6643 {
6644     struct regexp *const rx = ReANY(r);
6645
6646     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6647
6648     if ( rx && RXp_PAREN_NAMES(rx) ) {
6649         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6650
6651         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6652     } else {
6653         return FALSE;
6654     }
6655 }
6656
6657 SV*
6658 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6659 {
6660     struct regexp *const rx = ReANY(r);
6661     GET_RE_DEBUG_FLAGS_DECL;
6662
6663     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6664
6665     if (rx && RXp_PAREN_NAMES(rx)) {
6666         HV *hv = RXp_PAREN_NAMES(rx);
6667         HE *temphe;
6668         while ( (temphe = hv_iternext_flags(hv,0)) ) {
6669             IV i;
6670             IV parno = 0;
6671             SV* sv_dat = HeVAL(temphe);
6672             I32 *nums = (I32*)SvPVX(sv_dat);
6673             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6674                 if ((I32)(rx->lastparen) >= nums[i] &&
6675                     rx->offs[nums[i]].start != -1 &&
6676                     rx->offs[nums[i]].end != -1)
6677                 {
6678                     parno = nums[i];
6679                     break;
6680                 }
6681             }
6682             if (parno || flags & RXapif_ALL) {
6683                 return newSVhek(HeKEY_hek(temphe));
6684             }
6685         }
6686     }
6687     return NULL;
6688 }
6689
6690 SV*
6691 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6692 {
6693     SV *ret;
6694     AV *av;
6695     SSize_t length;
6696     struct regexp *const rx = ReANY(r);
6697
6698     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6699
6700     if (rx && RXp_PAREN_NAMES(rx)) {
6701         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6702             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6703         } else if (flags & RXapif_ONE) {
6704             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6705             av = MUTABLE_AV(SvRV(ret));
6706             length = av_len(av);
6707             SvREFCNT_dec_NN(ret);
6708             return newSViv(length + 1);
6709         } else {
6710             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6711             return NULL;
6712         }
6713     }
6714     return &PL_sv_undef;
6715 }
6716
6717 SV*
6718 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6719 {
6720     struct regexp *const rx = ReANY(r);
6721     AV *av = newAV();
6722
6723     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6724
6725     if (rx && RXp_PAREN_NAMES(rx)) {
6726         HV *hv= RXp_PAREN_NAMES(rx);
6727         HE *temphe;
6728         (void)hv_iterinit(hv);
6729         while ( (temphe = hv_iternext_flags(hv,0)) ) {
6730             IV i;
6731             IV parno = 0;
6732             SV* sv_dat = HeVAL(temphe);
6733             I32 *nums = (I32*)SvPVX(sv_dat);
6734             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6735                 if ((I32)(rx->lastparen) >= nums[i] &&
6736                     rx->offs[nums[i]].start != -1 &&
6737                     rx->offs[nums[i]].end != -1)
6738                 {
6739                     parno = nums[i];
6740                     break;
6741                 }
6742             }
6743             if (parno || flags & RXapif_ALL) {
6744                 av_push(av, newSVhek(HeKEY_hek(temphe)));
6745             }
6746         }
6747     }
6748
6749     return newRV_noinc(MUTABLE_SV(av));
6750 }
6751
6752 void
6753 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6754                              SV * const sv)
6755 {
6756     struct regexp *const rx = ReANY(r);
6757     char *s = NULL;
6758     SSize_t i = 0;
6759     SSize_t s1, t1;
6760     I32 n = paren;
6761
6762     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6763
6764     if (      n == RX_BUFF_IDX_CARET_PREMATCH
6765            || n == RX_BUFF_IDX_CARET_FULLMATCH
6766            || n == RX_BUFF_IDX_CARET_POSTMATCH
6767        )
6768     {
6769         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
6770         if (!keepcopy) {
6771             /* on something like
6772              *    $r = qr/.../;
6773              *    /$qr/p;
6774              * the KEEPCOPY is set on the PMOP rather than the regex */
6775             if (PL_curpm && r == PM_GETRE(PL_curpm))
6776                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
6777         }
6778         if (!keepcopy)
6779             goto ret_undef;
6780     }
6781
6782     if (!rx->subbeg)
6783         goto ret_undef;
6784
6785     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
6786         /* no need to distinguish between them any more */
6787         n = RX_BUFF_IDX_FULLMATCH;
6788
6789     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
6790         && rx->offs[0].start != -1)
6791     {
6792         /* $`, ${^PREMATCH} */
6793         i = rx->offs[0].start;
6794         s = rx->subbeg;
6795     }
6796     else
6797     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
6798         && rx->offs[0].end != -1)
6799     {
6800         /* $', ${^POSTMATCH} */
6801         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
6802         i = rx->sublen + rx->suboffset - rx->offs[0].end;
6803     }
6804     else
6805     if ( 0 <= n && n <= (I32)rx->nparens &&
6806         (s1 = rx->offs[n].start) != -1 &&
6807         (t1 = rx->offs[n].end) != -1)
6808     {
6809         /* $&, ${^MATCH},  $1 ... */
6810         i = t1 - s1;
6811         s = rx->subbeg + s1 - rx->suboffset;
6812     } else {
6813         goto ret_undef;
6814     }
6815
6816     assert(s >= rx->subbeg);
6817     assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
6818     if (i >= 0) {
6819 #if NO_TAINT_SUPPORT
6820         sv_setpvn(sv, s, i);
6821 #else
6822         const int oldtainted = TAINT_get;
6823         TAINT_NOT;
6824         sv_setpvn(sv, s, i);
6825         TAINT_set(oldtainted);
6826 #endif
6827         if ( (rx->extflags & RXf_CANY_SEEN)
6828             ? (RXp_MATCH_UTF8(rx)
6829                         && (!i || is_utf8_string((U8*)s, i)))
6830             : (RXp_MATCH_UTF8(rx)) )
6831         {
6832             SvUTF8_on(sv);
6833         }
6834         else
6835             SvUTF8_off(sv);
6836         if (TAINTING_get) {
6837             if (RXp_MATCH_TAINTED(rx)) {
6838                 if (SvTYPE(sv) >= SVt_PVMG) {
6839                     MAGIC* const mg = SvMAGIC(sv);
6840                     MAGIC* mgt;
6841                     TAINT;
6842                     SvMAGIC_set(sv, mg->mg_moremagic);
6843                     SvTAINT(sv);
6844                     if ((mgt = SvMAGIC(sv))) {
6845                         mg->mg_moremagic = mgt;
6846                         SvMAGIC_set(sv, mg);
6847                     }
6848                 } else {
6849                     TAINT;
6850                     SvTAINT(sv);
6851                 }
6852             } else
6853                 SvTAINTED_off(sv);
6854         }
6855     } else {
6856       ret_undef:
6857         sv_setsv(sv,&PL_sv_undef);
6858         return;
6859     }
6860 }
6861
6862 void
6863 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6864                                                          SV const * const value)
6865 {
6866     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6867
6868     PERL_UNUSED_ARG(rx);
6869     PERL_UNUSED_ARG(paren);
6870     PERL_UNUSED_ARG(value);
6871
6872     if (!PL_localizing)
6873         Perl_croak_no_modify();
6874 }
6875
6876 I32
6877 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6878                               const I32 paren)
6879 {
6880     struct regexp *const rx = ReANY(r);
6881     I32 i;
6882     I32 s1, t1;
6883
6884     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6885
6886     if (   paren == RX_BUFF_IDX_CARET_PREMATCH
6887         || paren == RX_BUFF_IDX_CARET_FULLMATCH
6888         || paren == RX_BUFF_IDX_CARET_POSTMATCH
6889     )
6890     {
6891         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
6892         if (!keepcopy) {
6893             /* on something like
6894              *    $r = qr/.../;
6895              *    /$qr/p;
6896              * the KEEPCOPY is set on the PMOP rather than the regex */
6897             if (PL_curpm && r == PM_GETRE(PL_curpm))
6898                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
6899         }
6900         if (!keepcopy)
6901             goto warn_undef;
6902     }
6903
6904     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6905     switch (paren) {
6906       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
6907       case RX_BUFF_IDX_PREMATCH:       /* $` */
6908         if (rx->offs[0].start != -1) {
6909                         i = rx->offs[0].start;
6910                         if (i > 0) {
6911                                 s1 = 0;
6912                                 t1 = i;
6913                                 goto getlen;
6914                         }
6915             }
6916         return 0;
6917
6918       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
6919       case RX_BUFF_IDX_POSTMATCH:       /* $' */
6920             if (rx->offs[0].end != -1) {
6921                         i = rx->sublen - rx->offs[0].end;
6922                         if (i > 0) {
6923                                 s1 = rx->offs[0].end;
6924                                 t1 = rx->sublen;
6925                                 goto getlen;
6926                         }
6927             }
6928         return 0;
6929
6930       default: /* $& / ${^MATCH}, $1, $2, ... */
6931             if (paren <= (I32)rx->nparens &&
6932             (s1 = rx->offs[paren].start) != -1 &&
6933             (t1 = rx->offs[paren].end) != -1)
6934             {
6935             i = t1 - s1;
6936             goto getlen;
6937         } else {
6938           warn_undef:
6939             if (ckWARN(WARN_UNINITIALIZED))
6940                 report_uninit((const SV *)sv);
6941             return 0;
6942         }
6943     }
6944   getlen:
6945     if (i > 0 && RXp_MATCH_UTF8(rx)) {
6946         const char * const s = rx->subbeg - rx->suboffset + s1;
6947         const U8 *ep;
6948         STRLEN el;
6949
6950         i = t1 - s1;
6951         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6952                         i = el;
6953     }
6954     return i;
6955 }
6956
6957 SV*
6958 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6959 {
6960     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6961         PERL_UNUSED_ARG(rx);
6962         if (0)
6963             return NULL;
6964         else
6965             return newSVpvs("Regexp");
6966 }
6967
6968 /* Scans the name of a named buffer from the pattern.
6969  * If flags is REG_RSN_RETURN_NULL returns null.
6970  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6971  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6972  * to the parsed name as looked up in the RExC_paren_names hash.
6973  * If there is an error throws a vFAIL().. type exception.
6974  */
6975
6976 #define REG_RSN_RETURN_NULL    0
6977 #define REG_RSN_RETURN_NAME    1
6978 #define REG_RSN_RETURN_DATA    2
6979
6980 STATIC SV*
6981 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6982 {
6983     char *name_start = RExC_parse;
6984
6985     PERL_ARGS_ASSERT_REG_SCAN_NAME;
6986
6987     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6988          /* skip IDFIRST by using do...while */
6989         if (UTF)
6990             do {
6991                 RExC_parse += UTF8SKIP(RExC_parse);
6992             } while (isWORDCHAR_utf8((U8*)RExC_parse));
6993         else
6994             do {
6995                 RExC_parse++;
6996             } while (isWORDCHAR(*RExC_parse));
6997     } else {
6998         RExC_parse++; /* so the <- from the vFAIL is after the offending character */
6999         vFAIL("Group name must start with a non-digit word character");
7000     }
7001     if ( flags ) {
7002         SV* sv_name
7003             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7004                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7005         if ( flags == REG_RSN_RETURN_NAME)
7006             return sv_name;
7007         else if (flags==REG_RSN_RETURN_DATA) {
7008             HE *he_str = NULL;
7009             SV *sv_dat = NULL;
7010             if ( ! sv_name )      /* should not happen*/
7011                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7012             if (RExC_paren_names)
7013                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7014             if ( he_str )
7015                 sv_dat = HeVAL(he_str);
7016             if ( ! sv_dat )
7017                 vFAIL("Reference to nonexistent named group");
7018             return sv_dat;
7019         }
7020         else {
7021             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7022                        (unsigned long) flags);
7023         }
7024         assert(0); /* NOT REACHED */
7025     }
7026     return NULL;
7027 }
7028
7029 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
7030     int rem=(int)(RExC_end - RExC_parse);                       \
7031     int cut;                                                    \
7032     int num;                                                    \
7033     int iscut=0;                                                \
7034     if (rem>10) {                                               \
7035         rem=10;                                                 \
7036         iscut=1;                                                \
7037     }                                                           \
7038     cut=10-rem;                                                 \
7039     if (RExC_lastparse!=RExC_parse)                             \
7040         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
7041             rem, RExC_parse,                                    \
7042             cut + 4,                                            \
7043             iscut ? "..." : "<"                                 \
7044         );                                                      \
7045     else                                                        \
7046         PerlIO_printf(Perl_debug_log,"%16s","");                \
7047                                                                 \
7048     if (SIZE_ONLY)                                              \
7049        num = RExC_size + 1;                                     \
7050     else                                                        \
7051        num=REG_NODE_NUM(RExC_emit);                             \
7052     if (RExC_lastnum!=num)                                      \
7053        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
7054     else                                                        \
7055        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
7056     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
7057         (int)((depth*2)), "",                                   \
7058         (funcname)                                              \
7059     );                                                          \
7060     RExC_lastnum=num;                                           \
7061     RExC_lastparse=RExC_parse;                                  \
7062 })
7063
7064
7065
7066 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
7067     DEBUG_PARSE_MSG((funcname));                            \
7068     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
7069 })
7070 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
7071     DEBUG_PARSE_MSG((funcname));                            \
7072     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
7073 })
7074
7075 /* This section of code defines the inversion list object and its methods.  The
7076  * interfaces are highly subject to change, so as much as possible is static to
7077  * this file.  An inversion list is here implemented as a malloc'd C UV array
7078  * as an SVt_INVLIST scalar.
7079  *
7080  * An inversion list for Unicode is an array of code points, sorted by ordinal
7081  * number.  The zeroth element is the first code point in the list.  The 1th
7082  * element is the first element beyond that not in the list.  In other words,
7083  * the first range is
7084  *  invlist[0]..(invlist[1]-1)
7085  * The other ranges follow.  Thus every element whose index is divisible by two
7086  * marks the beginning of a range that is in the list, and every element not
7087  * divisible by two marks the beginning of a range not in the list.  A single
7088  * element inversion list that contains the single code point N generally
7089  * consists of two elements
7090  *  invlist[0] == N
7091  *  invlist[1] == N+1
7092  * (The exception is when N is the highest representable value on the
7093  * machine, in which case the list containing just it would be a single
7094  * element, itself.  By extension, if the last range in the list extends to
7095  * infinity, then the first element of that range will be in the inversion list
7096  * at a position that is divisible by two, and is the final element in the
7097  * list.)
7098  * Taking the complement (inverting) an inversion list is quite simple, if the
7099  * first element is 0, remove it; otherwise add a 0 element at the beginning.
7100  * This implementation reserves an element at the beginning of each inversion
7101  * list to always contain 0; there is an additional flag in the header which
7102  * indicates if the list begins at the 0, or is offset to begin at the next
7103  * element.
7104  *
7105  * More about inversion lists can be found in "Unicode Demystified"
7106  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7107  * More will be coming when functionality is added later.
7108  *
7109  * The inversion list data structure is currently implemented as an SV pointing
7110  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
7111  * array of UV whose memory management is automatically handled by the existing
7112  * facilities for SV's.
7113  *
7114  * Some of the methods should always be private to the implementation, and some
7115  * should eventually be made public */
7116
7117 /* The header definitions are in F<inline_invlist.c> */
7118
7119 PERL_STATIC_INLINE UV*
7120 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7121 {
7122     /* Returns a pointer to the first element in the inversion list's array.
7123      * This is called upon initialization of an inversion list.  Where the
7124      * array begins depends on whether the list has the code point U+0000 in it
7125      * or not.  The other parameter tells it whether the code that follows this
7126      * call is about to put a 0 in the inversion list or not.  The first
7127      * element is either the element reserved for 0, if TRUE, or the element
7128      * after it, if FALSE */
7129
7130     bool* offset = get_invlist_offset_addr(invlist);
7131     UV* zero_addr = (UV *) SvPVX(invlist);
7132
7133     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7134
7135     /* Must be empty */
7136     assert(! _invlist_len(invlist));
7137
7138     *zero_addr = 0;
7139
7140     /* 1^1 = 0; 1^0 = 1 */
7141     *offset = 1 ^ will_have_0;
7142     return zero_addr + *offset;
7143 }
7144
7145 PERL_STATIC_INLINE UV*
7146 S_invlist_array(pTHX_ SV* const invlist)
7147 {
7148     /* Returns the pointer to the inversion list's array.  Every time the
7149      * length changes, this needs to be called in case malloc or realloc moved
7150      * it */
7151
7152     PERL_ARGS_ASSERT_INVLIST_ARRAY;
7153
7154     /* Must not be empty.  If these fail, you probably didn't check for <len>
7155      * being non-zero before trying to get the array */
7156     assert(_invlist_len(invlist));
7157
7158     /* The very first element always contains zero, The array begins either
7159      * there, or if the inversion list is offset, at the element after it.
7160      * The offset header field determines which; it contains 0 or 1 to indicate
7161      * how much additionally to add */
7162     assert(0 == *(SvPVX(invlist)));
7163     return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
7164 }
7165
7166 PERL_STATIC_INLINE void
7167 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
7168 {
7169     /* Sets the current number of elements stored in the inversion list.
7170      * Updates SvCUR correspondingly */
7171
7172     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7173
7174     assert(SvTYPE(invlist) == SVt_INVLIST);
7175
7176     SvCUR_set(invlist,
7177               (len == 0)
7178                ? 0
7179                : TO_INTERNAL_SIZE(len + offset));
7180     assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
7181 }
7182
7183 PERL_STATIC_INLINE IV*
7184 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7185 {
7186     /* Return the address of the IV that is reserved to hold the cached index
7187      * */
7188
7189     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7190
7191     assert(SvTYPE(invlist) == SVt_INVLIST);
7192
7193     return &(((XINVLIST*) SvANY(invlist))->prev_index);
7194 }
7195
7196 PERL_STATIC_INLINE IV
7197 S_invlist_previous_index(pTHX_ SV* const invlist)
7198 {
7199     /* Returns cached index of previous search */
7200
7201     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7202
7203     return *get_invlist_previous_index_addr(invlist);
7204 }
7205
7206 PERL_STATIC_INLINE void
7207 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7208 {
7209     /* Caches <index> for later retrieval */
7210
7211     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7212
7213     assert(index == 0 || index < (int) _invlist_len(invlist));
7214
7215     *get_invlist_previous_index_addr(invlist) = index;
7216 }
7217
7218 PERL_STATIC_INLINE UV
7219 S_invlist_max(pTHX_ SV* const invlist)
7220 {
7221     /* Returns the maximum number of elements storable in the inversion list's
7222      * array, without having to realloc() */
7223
7224     PERL_ARGS_ASSERT_INVLIST_MAX;
7225
7226     assert(SvTYPE(invlist) == SVt_INVLIST);
7227
7228     /* Assumes worst case, in which the 0 element is not counted in the
7229      * inversion list, so subtracts 1 for that */
7230     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
7231            ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
7232            : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
7233 }
7234
7235 #ifndef PERL_IN_XSUB_RE
7236 SV*
7237 Perl__new_invlist(pTHX_ IV initial_size)
7238 {
7239
7240     /* Return a pointer to a newly constructed inversion list, with enough
7241      * space to store 'initial_size' elements.  If that number is negative, a
7242      * system default is used instead */
7243
7244     SV* new_list;
7245
7246     if (initial_size < 0) {
7247         initial_size = 10;
7248     }
7249
7250     /* Allocate the initial space */
7251     new_list = newSV_type(SVt_INVLIST);
7252
7253     /* First 1 is in case the zero element isn't in the list; second 1 is for
7254      * trailing NUL */
7255     SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
7256     invlist_set_len(new_list, 0, 0);
7257
7258     /* Force iterinit() to be used to get iteration to work */
7259     *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
7260
7261     *get_invlist_previous_index_addr(new_list) = 0;
7262
7263     return new_list;
7264 }
7265 #endif
7266
7267 STATIC SV*
7268 S__new_invlist_C_array(pTHX_ const UV* const list)
7269 {
7270     /* Return a pointer to a newly constructed inversion list, initialized to
7271      * point to <list>, which has to be in the exact correct inversion list
7272      * form, including internal fields.  Thus this is a dangerous routine that
7273      * should not be used in the wrong hands.  The passed in 'list' contains
7274      * several header fields at the beginning that are not part of the
7275      * inversion list body proper */
7276
7277     const STRLEN length = (STRLEN) list[0];
7278     const UV version_id =          list[1];
7279     const bool offset   =    cBOOL(list[2]);
7280 #define HEADER_LENGTH 3
7281     /* If any of the above changes in any way, you must change HEADER_LENGTH
7282      * (if appropriate) and regenerate INVLIST_VERSION_ID by running
7283      *      perl -E 'say int(rand 2**31-1)'
7284      */
7285 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
7286                                         data structure type, so that one being
7287                                         passed in can be validated to be an
7288                                         inversion list of the correct vintage.
7289                                        */
7290
7291     SV* invlist = newSV_type(SVt_INVLIST);
7292
7293     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7294
7295     if (version_id != INVLIST_VERSION_ID) {
7296         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7297     }
7298
7299     /* The generated array passed in includes header elements that aren't part
7300      * of the list proper, so start it just after them */
7301     SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
7302
7303     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
7304                                shouldn't touch it */
7305
7306     *(get_invlist_offset_addr(invlist)) = offset;
7307
7308     /* The 'length' passed to us is the physical number of elements in the
7309      * inversion list.  But if there is an offset the logical number is one
7310      * less than that */
7311     invlist_set_len(invlist, length  - offset, offset);
7312
7313     invlist_set_previous_index(invlist, 0);
7314
7315     /* Initialize the iteration pointer. */
7316     invlist_iterfinish(invlist);
7317
7318     return invlist;
7319 }
7320
7321 STATIC void
7322 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7323 {
7324     /* Grow the maximum size of an inversion list */
7325
7326     PERL_ARGS_ASSERT_INVLIST_EXTEND;
7327
7328     assert(SvTYPE(invlist) == SVt_INVLIST);
7329
7330     /* Add one to account for the zero element at the beginning which may not
7331      * be counted by the calling parameters */
7332     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
7333 }
7334
7335 PERL_STATIC_INLINE void
7336 S_invlist_trim(pTHX_ SV* const invlist)
7337 {
7338     PERL_ARGS_ASSERT_INVLIST_TRIM;
7339
7340     assert(SvTYPE(invlist) == SVt_INVLIST);
7341
7342     /* Change the length of the inversion list to how many entries it currently
7343      * has */
7344     SvPV_shrink_to_cur((SV *) invlist);
7345 }
7346
7347 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
7348
7349 STATIC void
7350 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7351 {
7352    /* Subject to change or removal.  Append the range from 'start' to 'end' at
7353     * the end of the inversion list.  The range must be above any existing
7354     * ones. */
7355
7356     UV* array;
7357     UV max = invlist_max(invlist);
7358     UV len = _invlist_len(invlist);
7359     bool offset;
7360
7361     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7362
7363     if (len == 0) { /* Empty lists must be initialized */
7364         offset = start != 0;
7365         array = _invlist_array_init(invlist, ! offset);
7366     }
7367     else {
7368         /* Here, the existing list is non-empty. The current max entry in the
7369          * list is generally the first value not in the set, except when the
7370          * set extends to the end of permissible values, in which case it is
7371          * the first entry in that final set, and so this call is an attempt to
7372          * append out-of-order */
7373
7374         UV final_element = len - 1;
7375         array = invlist_array(invlist);
7376         if (array[final_element] > start
7377             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7378         {
7379             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7380                        array[final_element], start,
7381                        ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7382         }
7383
7384         /* Here, it is a legal append.  If the new range begins with the first
7385          * value not in the set, it is extending the set, so the new first
7386          * value not in the set is one greater than the newly extended range.
7387          * */
7388         offset = *get_invlist_offset_addr(invlist);
7389         if (array[final_element] == start) {
7390             if (end != UV_MAX) {
7391                 array[final_element] = end + 1;
7392             }
7393             else {
7394                 /* But if the end is the maximum representable on the machine,
7395                  * just let the range that this would extend to have no end */
7396                 invlist_set_len(invlist, len - 1, offset);
7397             }
7398             return;
7399         }
7400     }
7401
7402     /* Here the new range doesn't extend any existing set.  Add it */
7403
7404     len += 2;   /* Includes an element each for the start and end of range */
7405
7406     /* If wll overflow the existing space, extend, which may cause the array to
7407      * be moved */
7408     if (max < len) {
7409         invlist_extend(invlist, len);
7410
7411         /* Have to set len here to avoid assert failure in invlist_array() */
7412         invlist_set_len(invlist, len, offset);
7413
7414         array = invlist_array(invlist);
7415     }
7416     else {
7417         invlist_set_len(invlist, len, offset);
7418     }
7419
7420     /* The next item on the list starts the range, the one after that is
7421      * one past the new range.  */
7422     array[len - 2] = start;
7423     if (end != UV_MAX) {
7424         array[len - 1] = end + 1;
7425     }
7426     else {
7427         /* But if the end is the maximum representable on the machine, just let
7428          * the range have no end */
7429         invlist_set_len(invlist, len - 1, offset);
7430     }
7431 }
7432
7433 #ifndef PERL_IN_XSUB_RE
7434
7435 IV
7436 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7437 {
7438     /* Searches the inversion list for the entry that contains the input code
7439      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
7440      * return value is the index into the list's array of the range that
7441      * contains <cp> */
7442
7443     IV low = 0;
7444     IV mid;
7445     IV high = _invlist_len(invlist);
7446     const IV highest_element = high - 1;
7447     const UV* array;
7448
7449     PERL_ARGS_ASSERT__INVLIST_SEARCH;
7450
7451     /* If list is empty, return failure. */
7452     if (high == 0) {
7453         return -1;
7454     }
7455
7456     /* (We can't get the array unless we know the list is non-empty) */
7457     array = invlist_array(invlist);
7458
7459     mid = invlist_previous_index(invlist);
7460     assert(mid >=0 && mid <= highest_element);
7461
7462     /* <mid> contains the cache of the result of the previous call to this
7463      * function (0 the first time).  See if this call is for the same result,
7464      * or if it is for mid-1.  This is under the theory that calls to this
7465      * function will often be for related code points that are near each other.
7466      * And benchmarks show that caching gives better results.  We also test
7467      * here if the code point is within the bounds of the list.  These tests
7468      * replace others that would have had to be made anyway to make sure that
7469      * the array bounds were not exceeded, and these give us extra information
7470      * at the same time */
7471     if (cp >= array[mid]) {
7472         if (cp >= array[highest_element]) {
7473             return highest_element;
7474         }
7475
7476         /* Here, array[mid] <= cp < array[highest_element].  This means that
7477          * the final element is not the answer, so can exclude it; it also
7478          * means that <mid> is not the final element, so can refer to 'mid + 1'
7479          * safely */
7480         if (cp < array[mid + 1]) {
7481             return mid;
7482         }
7483         high--;
7484         low = mid + 1;
7485     }
7486     else { /* cp < aray[mid] */
7487         if (cp < array[0]) { /* Fail if outside the array */
7488             return -1;
7489         }
7490         high = mid;
7491         if (cp >= array[mid - 1]) {
7492             goto found_entry;
7493         }
7494     }
7495
7496     /* Binary search.  What we are looking for is <i> such that
7497      *  array[i] <= cp < array[i+1]
7498      * The loop below converges on the i+1.  Note that there may not be an
7499      * (i+1)th element in the array, and things work nonetheless */
7500     while (low < high) {
7501         mid = (low + high) / 2;
7502         assert(mid <= highest_element);
7503         if (array[mid] <= cp) { /* cp >= array[mid] */
7504             low = mid + 1;
7505
7506             /* We could do this extra test to exit the loop early.
7507             if (cp < array[low]) {
7508                 return mid;
7509             }
7510             */
7511         }
7512         else { /* cp < array[mid] */
7513             high = mid;
7514         }
7515     }
7516
7517   found_entry:
7518     high--;
7519     invlist_set_previous_index(invlist, high);
7520     return high;
7521 }
7522
7523 void
7524 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7525 {
7526     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7527      * but is used when the swash has an inversion list.  This makes this much
7528      * faster, as it uses a binary search instead of a linear one.  This is
7529      * intimately tied to that function, and perhaps should be in utf8.c,
7530      * except it is intimately tied to inversion lists as well.  It assumes
7531      * that <swatch> is all 0's on input */
7532
7533     UV current = start;
7534     const IV len = _invlist_len(invlist);
7535     IV i;
7536     const UV * array;
7537
7538     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7539
7540     if (len == 0) { /* Empty inversion list */
7541         return;
7542     }
7543
7544     array = invlist_array(invlist);
7545
7546     /* Find which element it is */
7547     i = _invlist_search(invlist, start);
7548
7549     /* We populate from <start> to <end> */
7550     while (current < end) {
7551         UV upper;
7552
7553         /* The inversion list gives the results for every possible code point
7554          * after the first one in the list.  Only those ranges whose index is
7555          * even are ones that the inversion list matches.  For the odd ones,
7556          * and if the initial code point is not in the list, we have to skip
7557          * forward to the next element */
7558         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7559             i++;
7560             if (i >= len) { /* Finished if beyond the end of the array */
7561                 return;
7562             }
7563             current = array[i];
7564             if (current >= end) {   /* Finished if beyond the end of what we
7565                                        are populating */
7566                 if (LIKELY(end < UV_MAX)) {
7567                     return;
7568                 }
7569
7570                 /* We get here when the upper bound is the maximum
7571                  * representable on the machine, and we are looking for just
7572                  * that code point.  Have to special case it */
7573                 i = len;
7574                 goto join_end_of_list;
7575             }
7576         }
7577         assert(current >= start);
7578
7579         /* The current range ends one below the next one, except don't go past
7580          * <end> */
7581         i++;
7582         upper = (i < len && array[i] < end) ? array[i] : end;
7583
7584         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
7585          * for each code point in it */
7586         for (; current < upper; current++) {
7587             const STRLEN offset = (STRLEN)(current - start);
7588             swatch[offset >> 3] |= 1 << (offset & 7);
7589         }
7590
7591     join_end_of_list:
7592
7593         /* Quit if at the end of the list */
7594         if (i >= len) {
7595
7596             /* But first, have to deal with the highest possible code point on
7597              * the platform.  The previous code assumes that <end> is one
7598              * beyond where we want to populate, but that is impossible at the
7599              * platform's infinity, so have to handle it specially */
7600             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7601             {
7602                 const STRLEN offset = (STRLEN)(end - start);
7603                 swatch[offset >> 3] |= 1 << (offset & 7);
7604             }
7605             return;
7606         }
7607
7608         /* Advance to the next range, which will be for code points not in the
7609          * inversion list */
7610         current = array[i];
7611     }
7612
7613     return;
7614 }
7615
7616 void
7617 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** output)
7618 {
7619     /* Take the union of two inversion lists and point <output> to it.  *output
7620      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7621      * the reference count to that list will be decremented.  The first list,
7622      * <a>, may be NULL, in which case a copy of the second list is returned.
7623      * If <complement_b> is TRUE, the union is taken of the complement
7624      * (inversion) of <b> instead of b itself.
7625      *
7626      * The basis for this comes from "Unicode Demystified" Chapter 13 by
7627      * Richard Gillam, published by Addison-Wesley, and explained at some
7628      * length there.  The preface says to incorporate its examples into your
7629      * code at your own risk.
7630      *
7631      * The algorithm is like a merge sort.
7632      *
7633      * XXX A potential performance improvement is to keep track as we go along
7634      * if only one of the inputs contributes to the result, meaning the other
7635      * is a subset of that one.  In that case, we can skip the final copy and
7636      * return the larger of the input lists, but then outside code might need
7637      * to keep track of whether to free the input list or not */
7638
7639     const UV* array_a;    /* a's array */
7640     const UV* array_b;
7641     UV len_a;       /* length of a's array */
7642     UV len_b;
7643
7644     SV* u;                      /* the resulting union */
7645     UV* array_u;
7646     UV len_u;
7647
7648     UV i_a = 0;             /* current index into a's array */
7649     UV i_b = 0;
7650     UV i_u = 0;
7651
7652     /* running count, as explained in the algorithm source book; items are
7653      * stopped accumulating and are output when the count changes to/from 0.
7654      * The count is incremented when we start a range that's in the set, and
7655      * decremented when we start a range that's not in the set.  So its range
7656      * is 0 to 2.  Only when the count is zero is something not in the set.
7657      */
7658     UV count = 0;
7659
7660     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7661     assert(a != b);
7662
7663     /* If either one is empty, the union is the other one */
7664     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
7665         if (*output == a) {
7666             if (a != NULL) {
7667                 SvREFCNT_dec_NN(a);
7668             }
7669         }
7670         if (*output != b) {
7671             *output = invlist_clone(b);
7672             if (complement_b) {
7673                 _invlist_invert(*output);
7674             }
7675         } /* else *output already = b; */
7676         return;
7677     }
7678     else if ((len_b = _invlist_len(b)) == 0) {
7679         if (*output == b) {
7680             SvREFCNT_dec_NN(b);
7681         }
7682
7683         /* The complement of an empty list is a list that has everything in it,
7684          * so the union with <a> includes everything too */
7685         if (complement_b) {
7686             if (a == *output) {
7687                 SvREFCNT_dec_NN(a);
7688             }
7689             *output = _new_invlist(1);
7690             _append_range_to_invlist(*output, 0, UV_MAX);
7691         }
7692         else if (*output != a) {
7693             *output = invlist_clone(a);
7694         }
7695         /* else *output already = a; */
7696         return;
7697     }
7698
7699     /* Here both lists exist and are non-empty */
7700     array_a = invlist_array(a);
7701     array_b = invlist_array(b);
7702
7703     /* If are to take the union of 'a' with the complement of b, set it
7704      * up so are looking at b's complement. */
7705     if (complement_b) {
7706
7707         /* To complement, we invert: if the first element is 0, remove it.  To
7708          * do this, we just pretend the array starts one later */
7709         if (array_b[0] == 0) {
7710             array_b++;
7711             len_b--;
7712         }
7713         else {
7714
7715             /* But if the first element is not zero, we pretend the list starts
7716              * at the 0 that is always stored immediately before the array. */
7717             array_b--;
7718             len_b++;
7719         }
7720     }
7721
7722     /* Size the union for the worst case: that the sets are completely
7723      * disjoint */
7724     u = _new_invlist(len_a + len_b);
7725
7726     /* Will contain U+0000 if either component does */
7727     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7728                                       || (len_b > 0 && array_b[0] == 0));
7729
7730     /* Go through each list item by item, stopping when exhausted one of
7731      * them */
7732     while (i_a < len_a && i_b < len_b) {
7733         UV cp;      /* The element to potentially add to the union's array */
7734         bool cp_in_set;   /* is it in the the input list's set or not */
7735
7736         /* We need to take one or the other of the two inputs for the union.
7737          * Since we are merging two sorted lists, we take the smaller of the
7738          * next items.  In case of a tie, we take the one that is in its set
7739          * first.  If we took one not in the set first, it would decrement the
7740          * count, possibly to 0 which would cause it to be output as ending the
7741          * range, and the next time through we would take the same number, and
7742          * output it again as beginning the next range.  By doing it the
7743          * opposite way, there is no possibility that the count will be
7744          * momentarily decremented to 0, and thus the two adjoining ranges will
7745          * be seamlessly merged.  (In a tie and both are in the set or both not
7746          * in the set, it doesn't matter which we take first.) */
7747         if (array_a[i_a] < array_b[i_b]
7748             || (array_a[i_a] == array_b[i_b]
7749                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7750         {
7751             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7752             cp= array_a[i_a++];
7753         }
7754         else {
7755             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7756             cp = array_b[i_b++];
7757         }
7758
7759         /* Here, have chosen which of the two inputs to look at.  Only output
7760          * if the running count changes to/from 0, which marks the
7761          * beginning/end of a range in that's in the set */
7762         if (cp_in_set) {
7763             if (count == 0) {
7764                 array_u[i_u++] = cp;
7765             }
7766             count++;
7767         }
7768         else {
7769             count--;
7770             if (count == 0) {
7771                 array_u[i_u++] = cp;
7772             }
7773         }
7774     }
7775
7776     /* Here, we are finished going through at least one of the lists, which
7777      * means there is something remaining in at most one.  We check if the list
7778      * that hasn't been exhausted is positioned such that we are in the middle
7779      * of a range in its set or not.  (i_a and i_b point to the element beyond
7780      * the one we care about.) If in the set, we decrement 'count'; if 0, there
7781      * is potentially more to output.
7782      * There are four cases:
7783      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
7784      *     in the union is entirely from the non-exhausted set.
7785      *  2) Both were in their sets, count is 2.  Nothing further should
7786      *     be output, as everything that remains will be in the exhausted
7787      *     list's set, hence in the union; decrementing to 1 but not 0 insures
7788      *     that
7789      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
7790      *     Nothing further should be output because the union includes
7791      *     everything from the exhausted set.  Not decrementing ensures that.
7792      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7793      *     decrementing to 0 insures that we look at the remainder of the
7794      *     non-exhausted set */
7795     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7796         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7797     {
7798         count--;
7799     }
7800
7801     /* The final length is what we've output so far, plus what else is about to
7802      * be output.  (If 'count' is non-zero, then the input list we exhausted
7803      * has everything remaining up to the machine's limit in its set, and hence
7804      * in the union, so there will be no further output. */
7805     len_u = i_u;
7806     if (count == 0) {
7807         /* At most one of the subexpressions will be non-zero */
7808         len_u += (len_a - i_a) + (len_b - i_b);
7809     }
7810
7811     /* Set result to final length, which can change the pointer to array_u, so
7812      * re-find it */
7813     if (len_u != _invlist_len(u)) {
7814         invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
7815         invlist_trim(u);
7816         array_u = invlist_array(u);
7817     }
7818
7819     /* When 'count' is 0, the list that was exhausted (if one was shorter than
7820      * the other) ended with everything above it not in its set.  That means
7821      * that the remaining part of the union is precisely the same as the
7822      * non-exhausted list, so can just copy it unchanged.  (If both list were
7823      * exhausted at the same time, then the operations below will be both 0.)
7824      */
7825     if (count == 0) {
7826         IV copy_count; /* At most one will have a non-zero copy count */
7827         if ((copy_count = len_a - i_a) > 0) {
7828             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7829         }
7830         else if ((copy_count = len_b - i_b) > 0) {
7831             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7832         }
7833     }
7834
7835     /*  We may be removing a reference to one of the inputs */
7836     if (a == *output || b == *output) {
7837         assert(! invlist_is_iterating(*output));
7838         SvREFCNT_dec_NN(*output);
7839     }
7840
7841     *output = u;
7842     return;
7843 }
7844
7845 void
7846 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** i)
7847 {
7848     /* Take the intersection of two inversion lists and point <i> to it.  *i
7849      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7850      * the reference count to that list will be decremented.
7851      * If <complement_b> is TRUE, the result will be the intersection of <a>
7852      * and the complement (or inversion) of <b> instead of <b> directly.
7853      *
7854      * The basis for this comes from "Unicode Demystified" Chapter 13 by
7855      * Richard Gillam, published by Addison-Wesley, and explained at some
7856      * length there.  The preface says to incorporate its examples into your
7857      * code at your own risk.  In fact, it had bugs
7858      *
7859      * The algorithm is like a merge sort, and is essentially the same as the
7860      * union above
7861      */
7862
7863     const UV* array_a;          /* a's array */
7864     const UV* array_b;
7865     UV len_a;   /* length of a's array */
7866     UV len_b;
7867
7868     SV* r;                   /* the resulting intersection */
7869     UV* array_r;
7870     UV len_r;
7871
7872     UV i_a = 0;             /* current index into a's array */
7873     UV i_b = 0;
7874     UV i_r = 0;
7875
7876     /* running count, as explained in the algorithm source book; items are
7877      * stopped accumulating and are output when the count changes to/from 2.
7878      * The count is incremented when we start a range that's in the set, and
7879      * decremented when we start a range that's not in the set.  So its range
7880      * is 0 to 2.  Only when the count is 2 is something in the intersection.
7881      */
7882     UV count = 0;
7883
7884     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7885     assert(a != b);
7886
7887     /* Special case if either one is empty */
7888     len_a = (a == NULL) ? 0 : _invlist_len(a);
7889     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
7890
7891         if (len_a != 0 && complement_b) {
7892
7893             /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7894              * be empty.  Here, also we are using 'b's complement, which hence
7895              * must be every possible code point.  Thus the intersection is
7896              * simply 'a'. */
7897             if (*i != a) {
7898                 if (*i == b) {
7899                     SvREFCNT_dec_NN(b);
7900                 }
7901
7902                 *i = invlist_clone(a);
7903             }
7904             /* else *i is already 'a' */
7905             return;
7906         }
7907
7908         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
7909          * intersection must be empty */
7910         if (*i == a) {
7911             SvREFCNT_dec_NN(a);
7912         }
7913         else if (*i == b) {
7914             SvREFCNT_dec_NN(b);
7915         }
7916         *i = _new_invlist(0);
7917         return;
7918     }
7919
7920     /* Here both lists exist and are non-empty */
7921     array_a = invlist_array(a);
7922     array_b = invlist_array(b);
7923
7924     /* If are to take the intersection of 'a' with the complement of b, set it
7925      * up so are looking at b's complement. */
7926     if (complement_b) {
7927
7928         /* To complement, we invert: if the first element is 0, remove it.  To
7929          * do this, we just pretend the array starts one later */
7930         if (array_b[0] == 0) {
7931             array_b++;
7932             len_b--;
7933         }
7934         else {
7935
7936             /* But if the first element is not zero, we pretend the list starts
7937              * at the 0 that is always stored immediately before the array. */
7938             array_b--;
7939             len_b++;
7940         }
7941     }
7942
7943     /* Size the intersection for the worst case: that the intersection ends up
7944      * fragmenting everything to be completely disjoint */
7945     r= _new_invlist(len_a + len_b);
7946
7947     /* Will contain U+0000 iff both components do */
7948     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7949                                      && len_b > 0 && array_b[0] == 0);
7950
7951     /* Go through each list item by item, stopping when exhausted one of
7952      * them */
7953     while (i_a < len_a && i_b < len_b) {
7954         UV cp;      /* The element to potentially add to the intersection's
7955                        array */
7956         bool cp_in_set; /* Is it in the input list's set or not */
7957
7958         /* We need to take one or the other of the two inputs for the
7959          * intersection.  Since we are merging two sorted lists, we take the
7960          * smaller of the next items.  In case of a tie, we take the one that
7961          * is not in its set first (a difference from the union algorithm).  If
7962          * we took one in the set first, it would increment the count, possibly
7963          * to 2 which would cause it to be output as starting a range in the
7964          * intersection, and the next time through we would take that same
7965          * number, and output it again as ending the set.  By doing it the
7966          * opposite of this, there is no possibility that the count will be
7967          * momentarily incremented to 2.  (In a tie and both are in the set or
7968          * both not in the set, it doesn't matter which we take first.) */
7969         if (array_a[i_a] < array_b[i_b]
7970             || (array_a[i_a] == array_b[i_b]
7971                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7972         {
7973             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7974             cp= array_a[i_a++];
7975         }
7976         else {
7977             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7978             cp= array_b[i_b++];
7979         }
7980
7981         /* Here, have chosen which of the two inputs to look at.  Only output
7982          * if the running count changes to/from 2, which marks the
7983          * beginning/end of a range that's in the intersection */
7984         if (cp_in_set) {
7985             count++;
7986             if (count == 2) {
7987                 array_r[i_r++] = cp;
7988             }
7989         }
7990         else {
7991             if (count == 2) {
7992                 array_r[i_r++] = cp;
7993             }
7994             count--;
7995         }
7996     }
7997
7998     /* Here, we are finished going through at least one of the lists, which
7999      * means there is something remaining in at most one.  We check if the list
8000      * that has been exhausted is positioned such that we are in the middle
8001      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
8002      * the ones we care about.)  There are four cases:
8003      *  1) Both weren't in their sets, count is 0, and remains 0.  There's
8004      *     nothing left in the intersection.
8005      *  2) Both were in their sets, count is 2 and perhaps is incremented to
8006      *     above 2.  What should be output is exactly that which is in the
8007      *     non-exhausted set, as everything it has is also in the intersection
8008      *     set, and everything it doesn't have can't be in the intersection
8009      *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8010      *     gets incremented to 2.  Like the previous case, the intersection is
8011      *     everything that remains in the non-exhausted set.
8012      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8013      *     remains 1.  And the intersection has nothing more. */
8014     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8015         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8016     {
8017         count++;
8018     }
8019
8020     /* The final length is what we've output so far plus what else is in the
8021      * intersection.  At most one of the subexpressions below will be non-zero */
8022     len_r = i_r;
8023     if (count >= 2) {
8024         len_r += (len_a - i_a) + (len_b - i_b);
8025     }
8026
8027     /* Set result to final length, which can change the pointer to array_r, so
8028      * re-find it */
8029     if (len_r != _invlist_len(r)) {
8030         invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8031         invlist_trim(r);
8032         array_r = invlist_array(r);
8033     }
8034
8035     /* Finish outputting any remaining */
8036     if (count >= 2) { /* At most one will have a non-zero copy count */
8037         IV copy_count;
8038         if ((copy_count = len_a - i_a) > 0) {
8039             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8040         }
8041         else if ((copy_count = len_b - i_b) > 0) {
8042             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8043         }
8044     }
8045
8046     /*  We may be removing a reference to one of the inputs */
8047     if (a == *i || b == *i) {
8048         assert(! invlist_is_iterating(*i));
8049         SvREFCNT_dec_NN(*i);
8050     }
8051
8052     *i = r;
8053     return;
8054 }
8055
8056 SV*
8057 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8058 {
8059     /* Add the range from 'start' to 'end' inclusive to the inversion list's
8060      * set.  A pointer to the inversion list is returned.  This may actually be
8061      * a new list, in which case the passed in one has been destroyed.  The
8062      * passed in inversion list can be NULL, in which case a new one is created
8063      * with just the one range in it */
8064
8065     SV* range_invlist;
8066     UV len;
8067
8068     if (invlist == NULL) {
8069         invlist = _new_invlist(2);
8070         len = 0;
8071     }
8072     else {
8073         len = _invlist_len(invlist);
8074     }
8075
8076     /* If comes after the final entry actually in the list, can just append it
8077      * to the end, */
8078     if (len == 0
8079         || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8080             && start >= invlist_array(invlist)[len - 1]))
8081     {
8082         _append_range_to_invlist(invlist, start, end);
8083         return invlist;
8084     }
8085
8086     /* Here, can't just append things, create and return a new inversion list
8087      * which is the union of this range and the existing inversion list */
8088     range_invlist = _new_invlist(2);
8089     _append_range_to_invlist(range_invlist, start, end);
8090
8091     _invlist_union(invlist, range_invlist, &invlist);
8092
8093     /* The temporary can be freed */
8094     SvREFCNT_dec_NN(range_invlist);
8095
8096     return invlist;
8097 }
8098
8099 #endif
8100
8101 PERL_STATIC_INLINE SV*
8102 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8103     return _add_range_to_invlist(invlist, cp, cp);
8104 }
8105
8106 #ifndef PERL_IN_XSUB_RE
8107 void
8108 Perl__invlist_invert(pTHX_ SV* const invlist)
8109 {
8110     /* Complement the input inversion list.  This adds a 0 if the list didn't
8111      * have a zero; removes it otherwise.  As described above, the data
8112      * structure is set up so that this is very efficient */
8113
8114     PERL_ARGS_ASSERT__INVLIST_INVERT;
8115
8116     assert(! invlist_is_iterating(invlist));
8117
8118     /* The inverse of matching nothing is matching everything */
8119     if (_invlist_len(invlist) == 0) {
8120         _append_range_to_invlist(invlist, 0, UV_MAX);
8121         return;
8122     }
8123
8124     *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
8125 }
8126
8127 void
8128 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
8129 {
8130     /* Complement the input inversion list (which must be a Unicode property,
8131      * all of which don't match above the Unicode maximum code point.)  And
8132      * Perl has chosen to not have the inversion match above that either.  This
8133      * adds a 0x110000 if the list didn't end with it, and removes it if it did
8134      */
8135
8136     UV len;
8137     UV* array;
8138
8139     PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
8140
8141     _invlist_invert(invlist);
8142
8143     len = _invlist_len(invlist);
8144
8145     if (len != 0) { /* If empty do nothing */
8146         array = invlist_array(invlist);
8147         if (array[len - 1] != PERL_UNICODE_MAX + 1) {
8148             /* Add 0x110000.  First, grow if necessary */
8149             len++;
8150             if (invlist_max(invlist) < len) {
8151                 invlist_extend(invlist, len);
8152                 array = invlist_array(invlist);
8153             }
8154             invlist_set_len(invlist, len, *get_invlist_offset_addr(invlist));
8155             array[len - 1] = PERL_UNICODE_MAX + 1;
8156         }
8157         else {  /* Remove the 0x110000 */
8158             invlist_set_len(invlist, len - 1, *get_invlist_offset_addr(invlist));
8159         }
8160     }
8161
8162     return;
8163 }
8164 #endif
8165
8166 PERL_STATIC_INLINE SV*
8167 S_invlist_clone(pTHX_ SV* const invlist)
8168 {
8169
8170     /* Return a new inversion list that is a copy of the input one, which is
8171      * unchanged */
8172
8173     /* Need to allocate extra space to accommodate Perl's addition of a
8174      * trailing NUL to SvPV's, since it thinks they are always strings */
8175     SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8176     STRLEN physical_length = SvCUR(invlist);
8177     bool offset = *(get_invlist_offset_addr(invlist));
8178
8179     PERL_ARGS_ASSERT_INVLIST_CLONE;
8180
8181     *(get_invlist_offset_addr(new_invlist)) = offset;
8182     invlist_set_len(new_invlist, _invlist_len(invlist), offset);
8183     Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
8184
8185     return new_invlist;
8186 }
8187
8188 PERL_STATIC_INLINE STRLEN*
8189 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8190 {
8191     /* Return the address of the UV that contains the current iteration
8192      * position */
8193
8194     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8195
8196     assert(SvTYPE(invlist) == SVt_INVLIST);
8197
8198     return &(((XINVLIST*) SvANY(invlist))->iterator);
8199 }
8200
8201 PERL_STATIC_INLINE void
8202 S_invlist_iterinit(pTHX_ SV* invlist)   /* Initialize iterator for invlist */
8203 {
8204     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8205
8206     *get_invlist_iter_addr(invlist) = 0;
8207 }
8208
8209 PERL_STATIC_INLINE void
8210 S_invlist_iterfinish(pTHX_ SV* invlist)
8211 {
8212     /* Terminate iterator for invlist.  This is to catch development errors.
8213      * Any iteration that is interrupted before completed should call this
8214      * function.  Functions that add code points anywhere else but to the end
8215      * of an inversion list assert that they are not in the middle of an
8216      * iteration.  If they were, the addition would make the iteration
8217      * problematical: if the iteration hadn't reached the place where things
8218      * were being added, it would be ok */
8219
8220     PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8221
8222     *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
8223 }
8224
8225 STATIC bool
8226 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8227 {
8228     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8229      * This call sets in <*start> and <*end>, the next range in <invlist>.
8230      * Returns <TRUE> if successful and the next call will return the next
8231      * range; <FALSE> if was already at the end of the list.  If the latter,
8232      * <*start> and <*end> are unchanged, and the next call to this function
8233      * will start over at the beginning of the list */
8234
8235     STRLEN* pos = get_invlist_iter_addr(invlist);
8236     UV len = _invlist_len(invlist);
8237     UV *array;
8238
8239     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8240
8241     if (*pos >= len) {
8242         *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
8243         return FALSE;
8244     }
8245
8246     array = invlist_array(invlist);
8247
8248     *start = array[(*pos)++];
8249
8250     if (*pos >= len) {
8251         *end = UV_MAX;
8252     }
8253     else {
8254         *end = array[(*pos)++] - 1;
8255     }
8256
8257     return TRUE;
8258 }
8259
8260 PERL_STATIC_INLINE bool
8261 S_invlist_is_iterating(pTHX_ SV* const invlist)
8262 {
8263     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8264
8265     return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8266 }
8267
8268 PERL_STATIC_INLINE UV
8269 S_invlist_highest(pTHX_ SV* const invlist)
8270 {
8271     /* Returns the highest code point that matches an inversion list.  This API
8272      * has an ambiguity, as it returns 0 under either the highest is actually
8273      * 0, or if the list is empty.  If this distinction matters to you, check
8274      * for emptiness before calling this function */
8275
8276     UV len = _invlist_len(invlist);
8277     UV *array;
8278
8279     PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8280
8281     if (len == 0) {
8282         return 0;
8283     }
8284
8285     array = invlist_array(invlist);
8286
8287     /* The last element in the array in the inversion list always starts a
8288      * range that goes to infinity.  That range may be for code points that are
8289      * matched in the inversion list, or it may be for ones that aren't
8290      * matched.  In the latter case, the highest code point in the set is one
8291      * less than the beginning of this range; otherwise it is the final element
8292      * of this range: infinity */
8293     return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8294            ? UV_MAX
8295            : array[len - 1] - 1;
8296 }
8297
8298 #ifndef PERL_IN_XSUB_RE
8299 SV *
8300 Perl__invlist_contents(pTHX_ SV* const invlist)
8301 {
8302     /* Get the contents of an inversion list into a string SV so that they can
8303      * be printed out.  It uses the format traditionally done for debug tracing
8304      */
8305
8306     UV start, end;
8307     SV* output = newSVpvs("\n");
8308
8309     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8310
8311     assert(! invlist_is_iterating(invlist));
8312
8313     invlist_iterinit(invlist);
8314     while (invlist_iternext(invlist, &start, &end)) {
8315         if (end == UV_MAX) {
8316             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8317         }
8318         else if (end != start) {
8319             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8320                     start,       end);
8321         }
8322         else {
8323             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8324         }
8325     }
8326
8327     return output;
8328 }
8329 #endif
8330
8331 #ifndef PERL_IN_XSUB_RE
8332 void
8333 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, const char * const indent, SV* const invlist)
8334 {
8335     /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
8336      * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
8337      * the string 'indent'.  The output looks like this:
8338          [0] 0x000A .. 0x000D
8339          [2] 0x0085
8340          [4] 0x2028 .. 0x2029
8341          [6] 0x3104 .. INFINITY
8342      * This means that the first range of code points matched by the list are
8343      * 0xA through 0xD; the second range contains only the single code point
8344      * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
8345      * are used to define each range (except if the final range extends to
8346      * infinity, only a single element is needed).  The array index of the
8347      * first element for the corresponding range is given in brackets. */
8348
8349     UV start, end;
8350     STRLEN count = 0;
8351
8352     PERL_ARGS_ASSERT__INVLIST_DUMP;
8353
8354     if (invlist_is_iterating(invlist)) {
8355         Perl_dump_indent(aTHX_ level, file,
8356              "%sCan't dump inversion list because is in middle of iterating\n",
8357              indent);
8358         return;
8359     }
8360
8361     invlist_iterinit(invlist);
8362     while (invlist_iternext(invlist, &start, &end)) {
8363         if (end == UV_MAX) {
8364             Perl_dump_indent(aTHX_ level, file,
8365                                        "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
8366                                    indent, (UV)count, start);
8367         }
8368         else if (end != start) {
8369             Perl_dump_indent(aTHX_ level, file,
8370                                     "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
8371                                 indent, (UV)count, start,         end);
8372         }
8373         else {
8374             Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
8375                                             indent, (UV)count, start);
8376         }
8377         count += 2;
8378     }
8379 }
8380 #endif
8381
8382 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
8383 bool
8384 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
8385 {
8386     /* Return a boolean as to if the two passed in inversion lists are
8387      * identical.  The final argument, if TRUE, says to take the complement of
8388      * the second inversion list before doing the comparison */
8389
8390     const UV* array_a = invlist_array(a);
8391     const UV* array_b = invlist_array(b);
8392     UV len_a = _invlist_len(a);
8393     UV len_b = _invlist_len(b);
8394
8395     UV i = 0;               /* current index into the arrays */
8396     bool retval = TRUE;     /* Assume are identical until proven otherwise */
8397
8398     PERL_ARGS_ASSERT__INVLISTEQ;
8399
8400     /* If are to compare 'a' with the complement of b, set it
8401      * up so are looking at b's complement. */
8402     if (complement_b) {
8403
8404         /* The complement of nothing is everything, so <a> would have to have
8405          * just one element, starting at zero (ending at infinity) */
8406         if (len_b == 0) {
8407             return (len_a == 1 && array_a[0] == 0);
8408         }
8409         else if (array_b[0] == 0) {
8410
8411             /* Otherwise, to complement, we invert.  Here, the first element is
8412              * 0, just remove it.  To do this, we just pretend the array starts
8413              * one later */
8414
8415             array_b++;
8416             len_b--;
8417         }
8418         else {
8419
8420             /* But if the first element is not zero, we pretend the list starts
8421              * at the 0 that is always stored immediately before the array. */
8422             array_b--;
8423             len_b++;
8424         }
8425     }
8426
8427     /* Make sure that the lengths are the same, as well as the final element
8428      * before looping through the remainder.  (Thus we test the length, final,
8429      * and first elements right off the bat) */
8430     if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8431         retval = FALSE;
8432     }
8433     else for (i = 0; i < len_a - 1; i++) {
8434         if (array_a[i] != array_b[i]) {
8435             retval = FALSE;
8436             break;
8437         }
8438     }
8439
8440     return retval;
8441 }
8442 #endif
8443
8444 #undef HEADER_LENGTH
8445 #undef TO_INTERNAL_SIZE
8446 #undef FROM_INTERNAL_SIZE
8447 #undef INVLIST_VERSION_ID
8448
8449 /* End of inversion list object */
8450
8451 STATIC void
8452 S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state)
8453 {
8454     /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
8455      * constructs, and updates RExC_flags with them.  On input, RExC_parse
8456      * should point to the first flag; it is updated on output to point to the
8457      * final ')' or ':'.  There needs to be at least one flag, or this will
8458      * abort */
8459
8460     /* for (?g), (?gc), and (?o) warnings; warning
8461        about (?c) will warn about (?g) -- japhy    */
8462
8463 #define WASTED_O  0x01
8464 #define WASTED_G  0x02
8465 #define WASTED_C  0x04
8466 #define WASTED_GC (WASTED_G|WASTED_C)
8467     I32 wastedflags = 0x00;
8468     U32 posflags = 0, negflags = 0;
8469     U32 *flagsp = &posflags;
8470     char has_charset_modifier = '\0';
8471     regex_charset cs;
8472     bool has_use_defaults = FALSE;
8473     const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
8474
8475     PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
8476
8477     /* '^' as an initial flag sets certain defaults */
8478     if (UCHARAT(RExC_parse) == '^') {
8479         RExC_parse++;
8480         has_use_defaults = TRUE;
8481         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8482         set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8483                                         ? REGEX_UNICODE_CHARSET
8484                                         : REGEX_DEPENDS_CHARSET);
8485     }
8486
8487     cs = get_regex_charset(RExC_flags);
8488     if (cs == REGEX_DEPENDS_CHARSET
8489         && (RExC_utf8 || RExC_uni_semantics))
8490     {
8491         cs = REGEX_UNICODE_CHARSET;
8492     }
8493
8494     while (*RExC_parse) {
8495         /* && strchr("iogcmsx", *RExC_parse) */
8496         /* (?g), (?gc) and (?o) are useless here
8497            and must be globally applied -- japhy */
8498         switch (*RExC_parse) {
8499
8500             /* Code for the imsx flags */
8501             CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8502
8503             case LOCALE_PAT_MOD:
8504                 if (has_charset_modifier) {
8505                     goto excess_modifier;
8506                 }
8507                 else if (flagsp == &negflags) {
8508                     goto neg_modifier;
8509                 }
8510                 cs = REGEX_LOCALE_CHARSET;
8511                 has_charset_modifier = LOCALE_PAT_MOD;
8512                 RExC_contains_locale = 1;
8513                 break;
8514             case UNICODE_PAT_MOD:
8515                 if (has_charset_modifier) {
8516                     goto excess_modifier;
8517                 }
8518                 else if (flagsp == &negflags) {
8519                     goto neg_modifier;
8520                 }
8521                 cs = REGEX_UNICODE_CHARSET;
8522                 has_charset_modifier = UNICODE_PAT_MOD;
8523                 break;
8524             case ASCII_RESTRICT_PAT_MOD:
8525                 if (flagsp == &negflags) {
8526                     goto neg_modifier;
8527                 }
8528                 if (has_charset_modifier) {
8529                     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8530                         goto excess_modifier;
8531                     }
8532                     /* Doubled modifier implies more restricted */
8533                     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8534                 }
8535                 else {
8536                     cs = REGEX_ASCII_RESTRICTED_CHARSET;
8537                 }
8538                 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8539                 break;
8540             case DEPENDS_PAT_MOD:
8541                 if (has_use_defaults) {
8542                     goto fail_modifiers;
8543                 }
8544                 else if (flagsp == &negflags) {
8545                     goto neg_modifier;
8546                 }
8547                 else if (has_charset_modifier) {
8548                     goto excess_modifier;
8549                 }
8550
8551                 /* The dual charset means unicode semantics if the
8552                  * pattern (or target, not known until runtime) are
8553                  * utf8, or something in the pattern indicates unicode
8554                  * semantics */
8555                 cs = (RExC_utf8 || RExC_uni_semantics)
8556                      ? REGEX_UNICODE_CHARSET
8557                      : REGEX_DEPENDS_CHARSET;
8558                 has_charset_modifier = DEPENDS_PAT_MOD;
8559                 break;
8560             excess_modifier:
8561                 RExC_parse++;
8562                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
8563                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
8564                 }
8565                 else if (has_charset_modifier == *(RExC_parse - 1)) {
8566                     vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
8567                 }
8568                 else {
8569                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
8570                 }
8571                 /*NOTREACHED*/
8572             neg_modifier:
8573                 RExC_parse++;
8574                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8575                 /*NOTREACHED*/
8576             case ONCE_PAT_MOD: /* 'o' */
8577             case GLOBAL_PAT_MOD: /* 'g' */
8578                 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8579                     const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
8580                     if (! (wastedflags & wflagbit) ) {
8581                         wastedflags |= wflagbit;
8582                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
8583                         vWARN5(
8584                             RExC_parse + 1,
8585                             "Useless (%s%c) - %suse /%c modifier",
8586                             flagsp == &negflags ? "?-" : "?",
8587                             *RExC_parse,
8588                             flagsp == &negflags ? "don't " : "",
8589                             *RExC_parse
8590                         );
8591                     }
8592                 }
8593                 break;
8594
8595             case CONTINUE_PAT_MOD: /* 'c' */
8596                 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8597                     if (! (wastedflags & WASTED_C) ) {
8598                         wastedflags |= WASTED_GC;
8599                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
8600                         vWARN3(
8601                             RExC_parse + 1,
8602                             "Useless (%sc) - %suse /gc modifier",
8603                             flagsp == &negflags ? "?-" : "?",
8604                             flagsp == &negflags ? "don't " : ""
8605                         );
8606                     }
8607                 }
8608                 break;
8609             case KEEPCOPY_PAT_MOD: /* 'p' */
8610                 if (flagsp == &negflags) {
8611                     if (SIZE_ONLY)
8612                         ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
8613                 } else {
8614                     *flagsp |= RXf_PMf_KEEPCOPY;
8615                 }
8616                 break;
8617             case '-':
8618                 /* A flag is a default iff it is following a minus, so
8619                  * if there is a minus, it means will be trying to
8620                  * re-specify a default which is an error */
8621                 if (has_use_defaults || flagsp == &negflags) {
8622                     goto fail_modifiers;
8623                 }
8624                 flagsp = &negflags;
8625                 wastedflags = 0;  /* reset so (?g-c) warns twice */
8626                 break;
8627             case ':':
8628             case ')':
8629                 RExC_flags |= posflags;
8630                 RExC_flags &= ~negflags;
8631                 set_regex_charset(&RExC_flags, cs);
8632                 return;
8633                 /*NOTREACHED*/
8634             default:
8635             fail_modifiers:
8636                 RExC_parse++;
8637                 vFAIL3("Sequence (%.*s...) not recognized",
8638                        RExC_parse-seqstart, seqstart);
8639                 /*NOTREACHED*/
8640         }
8641
8642         ++RExC_parse;
8643     }
8644 }
8645
8646 /*
8647  - reg - regular expression, i.e. main body or parenthesized thing
8648  *
8649  * Caller must absorb opening parenthesis.
8650  *
8651  * Combining parenthesis handling with the base level of regular expression
8652  * is a trifle forced, but the need to tie the tails of the branches to what
8653  * follows makes it hard to avoid.
8654  */
8655 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8656 #ifdef DEBUGGING
8657 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8658 #else
8659 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8660 #endif
8661
8662 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
8663    flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
8664    needs to be restarted.
8665    Otherwise would only return NULL if regbranch() returns NULL, which
8666    cannot happen.  */
8667 STATIC regnode *
8668 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
8669     /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
8670      * 2 is like 1, but indicates that nextchar() has been called to advance
8671      * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
8672      * this flag alerts us to the need to check for that */
8673 {
8674     dVAR;
8675     regnode *ret;               /* Will be the head of the group. */
8676     regnode *br;
8677     regnode *lastbr;
8678     regnode *ender = NULL;
8679     I32 parno = 0;
8680     I32 flags;
8681     U32 oregflags = RExC_flags;
8682     bool have_branch = 0;
8683     bool is_open = 0;
8684     I32 freeze_paren = 0;
8685     I32 after_freeze = 0;
8686
8687     char * parse_start = RExC_parse; /* MJD */
8688     char * const oregcomp_parse = RExC_parse;
8689
8690     GET_RE_DEBUG_FLAGS_DECL;
8691
8692     PERL_ARGS_ASSERT_REG;
8693     DEBUG_PARSE("reg ");
8694
8695     *flagp = 0;                         /* Tentatively. */
8696
8697
8698     /* Make an OPEN node, if parenthesized. */
8699     if (paren) {
8700
8701         /* Under /x, space and comments can be gobbled up between the '(' and
8702          * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
8703          * intervening space, as the sequence is a token, and a token should be
8704          * indivisible */
8705         bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
8706
8707         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
8708             char *start_verb = RExC_parse;
8709             STRLEN verb_len = 0;
8710             char *start_arg = NULL;
8711             unsigned char op = 0;
8712             int argok = 1;
8713             int internal_argval = 0; /* internal_argval is only useful if !argok */
8714
8715             if (has_intervening_patws && SIZE_ONLY) {
8716                 ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated");
8717             }
8718             while ( *RExC_parse && *RExC_parse != ')' ) {
8719                 if ( *RExC_parse == ':' ) {
8720                     start_arg = RExC_parse + 1;
8721                     break;
8722                 }
8723                 RExC_parse++;
8724             }
8725             ++start_verb;
8726             verb_len = RExC_parse - start_verb;
8727             if ( start_arg ) {
8728                 RExC_parse++;
8729                 while ( *RExC_parse && *RExC_parse != ')' )
8730                     RExC_parse++;
8731                 if ( *RExC_parse != ')' )
8732                     vFAIL("Unterminated verb pattern argument");
8733                 if ( RExC_parse == start_arg )
8734                     start_arg = NULL;
8735             } else {
8736                 if ( *RExC_parse != ')' )
8737                     vFAIL("Unterminated verb pattern");
8738             }
8739
8740             switch ( *start_verb ) {
8741             case 'A':  /* (*ACCEPT) */
8742                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
8743                     op = ACCEPT;
8744                     internal_argval = RExC_nestroot;
8745                 }
8746                 break;
8747             case 'C':  /* (*COMMIT) */
8748                 if ( memEQs(start_verb,verb_len,"COMMIT") )
8749                     op = COMMIT;
8750                 break;
8751             case 'F':  /* (*FAIL) */
8752                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8753                     op = OPFAIL;
8754                     argok = 0;
8755                 }
8756                 break;
8757             case ':':  /* (*:NAME) */
8758             case 'M':  /* (*MARK:NAME) */
8759                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8760                     op = MARKPOINT;
8761                     argok = -1;
8762                 }
8763                 break;
8764             case 'P':  /* (*PRUNE) */
8765                 if ( memEQs(start_verb,verb_len,"PRUNE") )
8766                     op = PRUNE;
8767                 break;
8768             case 'S':   /* (*SKIP) */
8769                 if ( memEQs(start_verb,verb_len,"SKIP") )
8770                     op = SKIP;
8771                 break;
8772             case 'T':  /* (*THEN) */
8773                 /* [19:06] <TimToady> :: is then */
8774                 if ( memEQs(start_verb,verb_len,"THEN") ) {
8775                     op = CUTGROUP;
8776                     RExC_seen |= REG_SEEN_CUTGROUP;
8777                 }
8778                 break;
8779             }
8780             if ( ! op ) {
8781                 RExC_parse++;
8782                 vFAIL3("Unknown verb pattern '%.*s'",
8783                     verb_len, start_verb);
8784             }
8785             if ( argok ) {
8786                 if ( start_arg && internal_argval ) {
8787                     vFAIL3("Verb pattern '%.*s' may not have an argument",
8788                         verb_len, start_verb);
8789                 } else if ( argok < 0 && !start_arg ) {
8790                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8791                         verb_len, start_verb);
8792                 } else {
8793                     ret = reganode(pRExC_state, op, internal_argval);
8794                     if ( ! internal_argval && ! SIZE_ONLY ) {
8795                         if (start_arg) {
8796                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8797                             ARG(ret) = add_data( pRExC_state, 1, "S" );
8798                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8799                             ret->flags = 0;
8800                         } else {
8801                             ret->flags = 1;
8802                         }
8803                     }
8804                 }
8805                 if (!internal_argval)
8806                     RExC_seen |= REG_SEEN_VERBARG;
8807             } else if ( start_arg ) {
8808                 vFAIL3("Verb pattern '%.*s' may not have an argument",
8809                         verb_len, start_verb);
8810             } else {
8811                 ret = reg_node(pRExC_state, op);
8812             }
8813             nextchar(pRExC_state);
8814             return ret;
8815         }
8816         else if (*RExC_parse == '?') { /* (?...) */
8817             bool is_logical = 0;
8818             const char * const seqstart = RExC_parse;
8819             if (has_intervening_patws && SIZE_ONLY) {
8820                 ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated");
8821             }
8822
8823             RExC_parse++;
8824             paren = *RExC_parse++;
8825             ret = NULL;                 /* For look-ahead/behind. */
8826             switch (paren) {
8827
8828             case 'P':   /* (?P...) variants for those used to PCRE/Python */
8829                 paren = *RExC_parse++;
8830                 if ( paren == '<')         /* (?P<...>) named capture */
8831                     goto named_capture;
8832                 else if (paren == '>') {   /* (?P>name) named recursion */
8833                     goto named_recursion;
8834                 }
8835                 else if (paren == '=') {   /* (?P=...)  named backref */
8836                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
8837                        you change this make sure you change that */
8838                     char* name_start = RExC_parse;
8839                     U32 num = 0;
8840                     SV *sv_dat = reg_scan_name(pRExC_state,
8841                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8842                     if (RExC_parse == name_start || *RExC_parse != ')')
8843                         vFAIL2("Sequence %.3s... not terminated",parse_start);
8844
8845                     if (!SIZE_ONLY) {
8846                         num = add_data( pRExC_state, 1, "S" );
8847                         RExC_rxi->data->data[num]=(void*)sv_dat;
8848                         SvREFCNT_inc_simple_void(sv_dat);
8849                     }
8850                     RExC_sawback = 1;
8851                     ret = reganode(pRExC_state,
8852                                    ((! FOLD)
8853                                      ? NREF
8854                                      : (ASCII_FOLD_RESTRICTED)
8855                                        ? NREFFA
8856                                        : (AT_LEAST_UNI_SEMANTICS)
8857                                          ? NREFFU
8858                                          : (LOC)
8859                                            ? NREFFL
8860                                            : NREFF),
8861                                     num);
8862                     *flagp |= HASWIDTH;
8863
8864                     Set_Node_Offset(ret, parse_start+1);
8865                     Set_Node_Cur_Length(ret, parse_start);
8866
8867                     nextchar(pRExC_state);
8868                     return ret;
8869                 }
8870                 RExC_parse++;
8871                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8872                 /*NOTREACHED*/
8873             case '<':           /* (?<...) */
8874                 if (*RExC_parse == '!')
8875                     paren = ',';
8876                 else if (*RExC_parse != '=')
8877               named_capture:
8878                 {               /* (?<...>) */
8879                     char *name_start;
8880                     SV *svname;
8881                     paren= '>';
8882             case '\'':          /* (?'...') */
8883                     name_start= RExC_parse;
8884                     svname = reg_scan_name(pRExC_state,
8885                         SIZE_ONLY ?  /* reverse test from the others */
8886                         REG_RSN_RETURN_NAME :
8887                         REG_RSN_RETURN_NULL);
8888                     if (RExC_parse == name_start) {
8889                         RExC_parse++;
8890                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8891                         /*NOTREACHED*/
8892                     }
8893                     if (*RExC_parse != paren)
8894                         vFAIL2("Sequence (?%c... not terminated",
8895                             paren=='>' ? '<' : paren);
8896                     if (SIZE_ONLY) {
8897                         HE *he_str;
8898                         SV *sv_dat = NULL;
8899                         if (!svname) /* shouldn't happen */
8900                             Perl_croak(aTHX_
8901                                 "panic: reg_scan_name returned NULL");
8902                         if (!RExC_paren_names) {
8903                             RExC_paren_names= newHV();
8904                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
8905 #ifdef DEBUGGING
8906                             RExC_paren_name_list= newAV();
8907                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8908 #endif
8909                         }
8910                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8911                         if ( he_str )
8912                             sv_dat = HeVAL(he_str);
8913                         if ( ! sv_dat ) {
8914                             /* croak baby croak */
8915                             Perl_croak(aTHX_
8916                                 "panic: paren_name hash element allocation failed");
8917                         } else if ( SvPOK(sv_dat) ) {
8918                             /* (?|...) can mean we have dupes so scan to check
8919                                its already been stored. Maybe a flag indicating
8920                                we are inside such a construct would be useful,
8921                                but the arrays are likely to be quite small, so
8922                                for now we punt -- dmq */
8923                             IV count = SvIV(sv_dat);
8924                             I32 *pv = (I32*)SvPVX(sv_dat);
8925                             IV i;
8926                             for ( i = 0 ; i < count ; i++ ) {
8927                                 if ( pv[i] == RExC_npar ) {
8928                                     count = 0;
8929                                     break;
8930                                 }
8931                             }
8932                             if ( count ) {
8933                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8934                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8935                                 pv[count] = RExC_npar;
8936                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8937                             }
8938                         } else {
8939                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
8940                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8941                             SvIOK_on(sv_dat);
8942                             SvIV_set(sv_dat, 1);
8943                         }
8944 #ifdef DEBUGGING
8945                         /* Yes this does cause a memory leak in debugging Perls */
8946                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8947                             SvREFCNT_dec_NN(svname);
8948 #endif
8949
8950                         /*sv_dump(sv_dat);*/
8951                     }
8952                     nextchar(pRExC_state);
8953                     paren = 1;
8954                     goto capturing_parens;
8955                 }
8956                 RExC_seen |= REG_SEEN_LOOKBEHIND;
8957                 RExC_in_lookbehind++;
8958                 RExC_parse++;
8959             case '=':           /* (?=...) */
8960                 RExC_seen_zerolen++;
8961                 break;
8962             case '!':           /* (?!...) */
8963                 RExC_seen_zerolen++;
8964                 if (*RExC_parse == ')') {
8965                     ret=reg_node(pRExC_state, OPFAIL);
8966                     nextchar(pRExC_state);
8967                     return ret;
8968                 }
8969                 break;
8970             case '|':           /* (?|...) */
8971                 /* branch reset, behave like a (?:...) except that
8972                    buffers in alternations share the same numbers */
8973                 paren = ':';
8974                 after_freeze = freeze_paren = RExC_npar;
8975                 break;
8976             case ':':           /* (?:...) */
8977             case '>':           /* (?>...) */
8978                 break;
8979             case '$':           /* (?$...) */
8980             case '@':           /* (?@...) */
8981                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8982                 break;
8983             case '#':           /* (?#...) */
8984                 /* XXX As soon as we disallow separating the '?' and '*' (by
8985                  * spaces or (?#...) comment), it is believed that this case
8986                  * will be unreachable and can be removed.  See
8987                  * [perl #117327] */
8988                 while (*RExC_parse && *RExC_parse != ')')
8989                     RExC_parse++;
8990                 if (*RExC_parse != ')')
8991                     FAIL("Sequence (?#... not terminated");
8992                 nextchar(pRExC_state);
8993                 *flagp = TRYAGAIN;
8994                 return NULL;
8995             case '0' :           /* (?0) */
8996             case 'R' :           /* (?R) */
8997                 if (*RExC_parse != ')')
8998                     FAIL("Sequence (?R) not terminated");
8999                 ret = reg_node(pRExC_state, GOSTART);
9000                 *flagp |= POSTPONED;
9001                 nextchar(pRExC_state);
9002                 return ret;
9003                 /*notreached*/
9004             { /* named and numeric backreferences */
9005                 I32 num;
9006             case '&':            /* (?&NAME) */
9007                 parse_start = RExC_parse - 1;
9008               named_recursion:
9009                 {
9010                     SV *sv_dat = reg_scan_name(pRExC_state,
9011                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9012                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9013                 }
9014                 goto gen_recurse_regop;
9015                 assert(0); /* NOT REACHED */
9016             case '+':
9017                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9018                     RExC_parse++;
9019                     vFAIL("Illegal pattern");
9020                 }
9021                 goto parse_recursion;
9022                 /* NOT REACHED*/
9023             case '-': /* (?-1) */
9024                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9025                     RExC_parse--; /* rewind to let it be handled later */
9026                     goto parse_flags;
9027                 }
9028                 /*FALLTHROUGH */
9029             case '1': case '2': case '3': case '4': /* (?1) */
9030             case '5': case '6': case '7': case '8': case '9':
9031                 RExC_parse--;
9032               parse_recursion:
9033                 num = atoi(RExC_parse);
9034                 parse_start = RExC_parse - 1; /* MJD */
9035                 if (*RExC_parse == '-')
9036                     RExC_parse++;
9037                 while (isDIGIT(*RExC_parse))
9038                         RExC_parse++;
9039                 if (*RExC_parse!=')')
9040                     vFAIL("Expecting close bracket");
9041
9042               gen_recurse_regop:
9043                 if ( paren == '-' ) {
9044                     /*
9045                     Diagram of capture buffer numbering.
9046                     Top line is the normal capture buffer numbers
9047                     Bottom line is the negative indexing as from
9048                     the X (the (?-2))
9049
9050                     +   1 2    3 4 5 X          6 7
9051                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
9052                     -   5 4    3 2 1 X          x x
9053
9054                     */
9055                     num = RExC_npar + num;
9056                     if (num < 1)  {
9057                         RExC_parse++;
9058                         vFAIL("Reference to nonexistent group");
9059                     }
9060                 } else if ( paren == '+' ) {
9061                     num = RExC_npar + num - 1;
9062                 }
9063
9064                 ret = reganode(pRExC_state, GOSUB, num);
9065                 if (!SIZE_ONLY) {
9066                     if (num > (I32)RExC_rx->nparens) {
9067                         RExC_parse++;
9068                         vFAIL("Reference to nonexistent group");
9069                     }
9070                     ARG2L_SET( ret, RExC_recurse_count++);
9071                     RExC_emit++;
9072                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9073                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
9074                 } else {
9075                     RExC_size++;
9076                 }
9077                 RExC_seen |= REG_SEEN_RECURSE;
9078                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
9079                 Set_Node_Offset(ret, parse_start); /* MJD */
9080
9081                 *flagp |= POSTPONED;
9082                 nextchar(pRExC_state);
9083                 return ret;
9084             } /* named and numeric backreferences */
9085             assert(0); /* NOT REACHED */
9086
9087             case '?':           /* (??...) */
9088                 is_logical = 1;
9089                 if (*RExC_parse != '{') {
9090                     RExC_parse++;
9091                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9092                     /*NOTREACHED*/
9093                 }
9094                 *flagp |= POSTPONED;
9095                 paren = *RExC_parse++;
9096                 /* FALL THROUGH */
9097             case '{':           /* (?{...}) */
9098             {
9099                 U32 n = 0;
9100                 struct reg_code_block *cb;
9101
9102                 RExC_seen_zerolen++;
9103
9104                 if (   !pRExC_state->num_code_blocks
9105                     || pRExC_state->code_index >= pRExC_state->num_code_blocks
9106                     || pRExC_state->code_blocks[pRExC_state->code_index].start
9107                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
9108                             - RExC_start)
9109                 ) {
9110                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
9111                         FAIL("panic: Sequence (?{...}): no code block found\n");
9112                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
9113                 }
9114                 /* this is a pre-compiled code block (?{...}) */
9115                 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
9116                 RExC_parse = RExC_start + cb->end;
9117                 if (!SIZE_ONLY) {
9118                     OP *o = cb->block;
9119                     if (cb->src_regex) {
9120                         n = add_data(pRExC_state, 2, "rl");
9121                         RExC_rxi->data->data[n] =
9122                             (void*)SvREFCNT_inc((SV*)cb->src_regex);
9123                         RExC_rxi->data->data[n+1] = (void*)o;
9124                     }
9125                     else {
9126                         n = add_data(pRExC_state, 1,
9127                                (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
9128                         RExC_rxi->data->data[n] = (void*)o;
9129                     }
9130                 }
9131                 pRExC_state->code_index++;
9132                 nextchar(pRExC_state);
9133
9134                 if (is_logical) {
9135                     regnode *eval;
9136                     ret = reg_node(pRExC_state, LOGICAL);
9137                     eval = reganode(pRExC_state, EVAL, n);
9138                     if (!SIZE_ONLY) {
9139                         ret->flags = 2;
9140                         /* for later propagation into (??{}) return value */
9141                         eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
9142                     }
9143                     REGTAIL(pRExC_state, ret, eval);
9144                     /* deal with the length of this later - MJD */
9145                     return ret;
9146                 }
9147                 ret = reganode(pRExC_state, EVAL, n);
9148                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
9149                 Set_Node_Offset(ret, parse_start);
9150                 return ret;
9151             }
9152             case '(':           /* (?(?{...})...) and (?(?=...)...) */
9153             {
9154                 int is_define= 0;
9155                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
9156                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9157                         || RExC_parse[1] == '<'
9158                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
9159                         I32 flag;
9160                         regnode *tail;
9161
9162                         ret = reg_node(pRExC_state, LOGICAL);
9163                         if (!SIZE_ONLY)
9164                             ret->flags = 1;
9165
9166                         tail = reg(pRExC_state, 1, &flag, depth+1);
9167                         if (flag & RESTART_UTF8) {
9168                             *flagp = RESTART_UTF8;
9169                             return NULL;
9170                         }
9171                         REGTAIL(pRExC_state, ret, tail);
9172                         goto insert_if;
9173                     }
9174                 }
9175                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
9176                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9177                 {
9178                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
9179                     char *name_start= RExC_parse++;
9180                     U32 num = 0;
9181                     SV *sv_dat=reg_scan_name(pRExC_state,
9182                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9183                     if (RExC_parse == name_start || *RExC_parse != ch)
9184                         vFAIL2("Sequence (?(%c... not terminated",
9185                             (ch == '>' ? '<' : ch));
9186                     RExC_parse++;
9187                     if (!SIZE_ONLY) {
9188                         num = add_data( pRExC_state, 1, "S" );
9189                         RExC_rxi->data->data[num]=(void*)sv_dat;
9190                         SvREFCNT_inc_simple_void(sv_dat);
9191                     }
9192                     ret = reganode(pRExC_state,NGROUPP,num);
9193                     goto insert_if_check_paren;
9194                 }
9195                 else if (RExC_parse[0] == 'D' &&
9196                          RExC_parse[1] == 'E' &&
9197                          RExC_parse[2] == 'F' &&
9198                          RExC_parse[3] == 'I' &&
9199                          RExC_parse[4] == 'N' &&
9200                          RExC_parse[5] == 'E')
9201                 {
9202                     ret = reganode(pRExC_state,DEFINEP,0);
9203                     RExC_parse +=6 ;
9204                     is_define = 1;
9205                     goto insert_if_check_paren;
9206                 }
9207                 else if (RExC_parse[0] == 'R') {
9208                     RExC_parse++;
9209                     parno = 0;
9210                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9211                         parno = atoi(RExC_parse++);
9212                         while (isDIGIT(*RExC_parse))
9213                             RExC_parse++;
9214                     } else if (RExC_parse[0] == '&') {
9215                         SV *sv_dat;
9216                         RExC_parse++;
9217                         sv_dat = reg_scan_name(pRExC_state,
9218                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9219                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9220                     }
9221                     ret = reganode(pRExC_state,INSUBP,parno);
9222                     goto insert_if_check_paren;
9223                 }
9224                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9225                     /* (?(1)...) */
9226                     char c;
9227                     parno = atoi(RExC_parse++);
9228
9229                     while (isDIGIT(*RExC_parse))
9230                         RExC_parse++;
9231                     ret = reganode(pRExC_state, GROUPP, parno);
9232
9233                  insert_if_check_paren:
9234                     if ((c = *nextchar(pRExC_state)) != ')')
9235                         vFAIL("Switch condition not recognized");
9236                   insert_if:
9237                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
9238                     br = regbranch(pRExC_state, &flags, 1,depth+1);
9239                     if (br == NULL) {
9240                         if (flags & RESTART_UTF8) {
9241                             *flagp = RESTART_UTF8;
9242                             return NULL;
9243                         }
9244                         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9245                               (UV) flags);
9246                     } else
9247                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
9248                     c = *nextchar(pRExC_state);
9249                     if (flags&HASWIDTH)
9250                         *flagp |= HASWIDTH;
9251                     if (c == '|') {
9252                         if (is_define)
9253                             vFAIL("(?(DEFINE)....) does not allow branches");
9254                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
9255                         if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
9256                             if (flags & RESTART_UTF8) {
9257                                 *flagp = RESTART_UTF8;
9258                                 return NULL;
9259                             }
9260                             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9261                                   (UV) flags);
9262                         }
9263                         REGTAIL(pRExC_state, ret, lastbr);
9264                         if (flags&HASWIDTH)
9265                             *flagp |= HASWIDTH;
9266                         c = *nextchar(pRExC_state);
9267                     }
9268                     else
9269                         lastbr = NULL;
9270                     if (c != ')')
9271                         vFAIL("Switch (?(condition)... contains too many branches");
9272                     ender = reg_node(pRExC_state, TAIL);
9273                     REGTAIL(pRExC_state, br, ender);
9274                     if (lastbr) {
9275                         REGTAIL(pRExC_state, lastbr, ender);
9276                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
9277                     }
9278                     else
9279                         REGTAIL(pRExC_state, ret, ender);
9280                     RExC_size++; /* XXX WHY do we need this?!!
9281                                     For large programs it seems to be required
9282                                     but I can't figure out why. -- dmq*/
9283                     return ret;
9284                 }
9285                 else {
9286                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
9287                 }
9288             }
9289             case '[':           /* (?[ ... ]) */
9290                 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
9291                                          oregcomp_parse);
9292             case 0:
9293                 RExC_parse--; /* for vFAIL to print correctly */
9294                 vFAIL("Sequence (? incomplete");
9295                 break;
9296             default: /* e.g., (?i) */
9297                 --RExC_parse;
9298               parse_flags:
9299                 parse_lparen_question_flags(pRExC_state);
9300                 if (UCHARAT(RExC_parse) != ':') {
9301                     nextchar(pRExC_state);
9302                     *flagp = TRYAGAIN;
9303                     return NULL;
9304                 }
9305                 paren = ':';
9306                 nextchar(pRExC_state);
9307                 ret = NULL;
9308                 goto parse_rest;
9309             } /* end switch */
9310         }
9311         else {                  /* (...) */
9312           capturing_parens:
9313             parno = RExC_npar;
9314             RExC_npar++;
9315
9316             ret = reganode(pRExC_state, OPEN, parno);
9317             if (!SIZE_ONLY ){
9318                 if (!RExC_nestroot)
9319                     RExC_nestroot = parno;
9320                 if (RExC_seen & REG_SEEN_RECURSE
9321                     && !RExC_open_parens[parno-1])
9322                 {
9323                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9324                         "Setting open paren #%"IVdf" to %d\n",
9325                         (IV)parno, REG_NODE_NUM(ret)));
9326                     RExC_open_parens[parno-1]= ret;
9327                 }
9328             }
9329             Set_Node_Length(ret, 1); /* MJD */
9330             Set_Node_Offset(ret, RExC_parse); /* MJD */
9331             is_open = 1;
9332         }
9333     }
9334     else                        /* ! paren */
9335         ret = NULL;
9336
9337    parse_rest:
9338     /* Pick up the branches, linking them together. */
9339     parse_start = RExC_parse;   /* MJD */
9340     br = regbranch(pRExC_state, &flags, 1,depth+1);
9341
9342     /*     branch_len = (paren != 0); */
9343
9344     if (br == NULL) {
9345         if (flags & RESTART_UTF8) {
9346             *flagp = RESTART_UTF8;
9347             return NULL;
9348         }
9349         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
9350     }
9351     if (*RExC_parse == '|') {
9352         if (!SIZE_ONLY && RExC_extralen) {
9353             reginsert(pRExC_state, BRANCHJ, br, depth+1);
9354         }
9355         else {                  /* MJD */
9356             reginsert(pRExC_state, BRANCH, br, depth+1);
9357             Set_Node_Length(br, paren != 0);
9358             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9359         }
9360         have_branch = 1;
9361         if (SIZE_ONLY)
9362             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
9363     }
9364     else if (paren == ':') {
9365         *flagp |= flags&SIMPLE;
9366     }
9367     if (is_open) {                              /* Starts with OPEN. */
9368         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
9369     }
9370     else if (paren != '?')              /* Not Conditional */
9371         ret = br;
9372     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9373     lastbr = br;
9374     while (*RExC_parse == '|') {
9375         if (!SIZE_ONLY && RExC_extralen) {
9376             ender = reganode(pRExC_state, LONGJMP,0);
9377             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9378         }
9379         if (SIZE_ONLY)
9380             RExC_extralen += 2;         /* Account for LONGJMP. */
9381         nextchar(pRExC_state);
9382         if (freeze_paren) {
9383             if (RExC_npar > after_freeze)
9384                 after_freeze = RExC_npar;
9385             RExC_npar = freeze_paren;
9386         }
9387         br = regbranch(pRExC_state, &flags, 0, depth+1);
9388
9389         if (br == NULL) {
9390             if (flags & RESTART_UTF8) {
9391                 *flagp = RESTART_UTF8;
9392                 return NULL;
9393             }
9394             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
9395         }
9396         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
9397         lastbr = br;
9398         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9399     }
9400
9401     if (have_branch || paren != ':') {
9402         /* Make a closing node, and hook it on the end. */
9403         switch (paren) {
9404         case ':':
9405             ender = reg_node(pRExC_state, TAIL);
9406             break;
9407         case 1: case 2:
9408             ender = reganode(pRExC_state, CLOSE, parno);
9409             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9410                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9411                         "Setting close paren #%"IVdf" to %d\n",
9412                         (IV)parno, REG_NODE_NUM(ender)));
9413                 RExC_close_parens[parno-1]= ender;
9414                 if (RExC_nestroot == parno)
9415                     RExC_nestroot = 0;
9416             }
9417             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9418             Set_Node_Length(ender,1); /* MJD */
9419             break;
9420         case '<':
9421         case ',':
9422         case '=':
9423         case '!':
9424             *flagp &= ~HASWIDTH;
9425             /* FALL THROUGH */
9426         case '>':
9427             ender = reg_node(pRExC_state, SUCCEED);
9428             break;
9429         case 0:
9430             ender = reg_node(pRExC_state, END);
9431             if (!SIZE_ONLY) {
9432                 assert(!RExC_opend); /* there can only be one! */
9433                 RExC_opend = ender;
9434             }
9435             break;
9436         }
9437         DEBUG_PARSE_r(if (!SIZE_ONLY) {
9438             SV * const mysv_val1=sv_newmortal();
9439             SV * const mysv_val2=sv_newmortal();
9440             DEBUG_PARSE_MSG("lsbr");
9441             regprop(RExC_rx, mysv_val1, lastbr);
9442             regprop(RExC_rx, mysv_val2, ender);
9443             PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9444                           SvPV_nolen_const(mysv_val1),
9445                           (IV)REG_NODE_NUM(lastbr),
9446                           SvPV_nolen_const(mysv_val2),
9447                           (IV)REG_NODE_NUM(ender),
9448                           (IV)(ender - lastbr)
9449             );
9450         });
9451         REGTAIL(pRExC_state, lastbr, ender);
9452
9453         if (have_branch && !SIZE_ONLY) {
9454             char is_nothing= 1;
9455             if (depth==1)
9456                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9457
9458             /* Hook the tails of the branches to the closing node. */
9459             for (br = ret; br; br = regnext(br)) {
9460                 const U8 op = PL_regkind[OP(br)];
9461                 if (op == BRANCH) {
9462                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9463                     if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9464                         is_nothing= 0;
9465                 }
9466                 else if (op == BRANCHJ) {
9467                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9468                     /* for now we always disable this optimisation * /
9469                     if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9470                     */
9471                         is_nothing= 0;
9472                 }
9473             }
9474             if (is_nothing) {
9475                 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9476                 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9477                     SV * const mysv_val1=sv_newmortal();
9478                     SV * const mysv_val2=sv_newmortal();
9479                     DEBUG_PARSE_MSG("NADA");
9480                     regprop(RExC_rx, mysv_val1, ret);
9481                     regprop(RExC_rx, mysv_val2, ender);
9482                     PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9483                                   SvPV_nolen_const(mysv_val1),
9484                                   (IV)REG_NODE_NUM(ret),
9485                                   SvPV_nolen_const(mysv_val2),
9486                                   (IV)REG_NODE_NUM(ender),
9487                                   (IV)(ender - ret)
9488                     );
9489                 });
9490                 OP(br)= NOTHING;
9491                 if (OP(ender) == TAIL) {
9492                     NEXT_OFF(br)= 0;
9493                     RExC_emit= br + 1;
9494                 } else {
9495                     regnode *opt;
9496                     for ( opt= br + 1; opt < ender ; opt++ )
9497                         OP(opt)= OPTIMIZED;
9498                     NEXT_OFF(br)= ender - br;
9499                 }
9500             }
9501         }
9502     }
9503
9504     {
9505         const char *p;
9506         static const char parens[] = "=!<,>";
9507
9508         if (paren && (p = strchr(parens, paren))) {
9509             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9510             int flag = (p - parens) > 1;
9511
9512             if (paren == '>')
9513                 node = SUSPEND, flag = 0;
9514             reginsert(pRExC_state, node,ret, depth+1);
9515             Set_Node_Cur_Length(ret, parse_start);
9516             Set_Node_Offset(ret, parse_start + 1);
9517             ret->flags = flag;
9518             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9519         }
9520     }
9521
9522     /* Check for proper termination. */
9523     if (paren) {
9524         /* restore original flags, but keep (?p) */
9525         RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
9526         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9527             RExC_parse = oregcomp_parse;
9528             vFAIL("Unmatched (");
9529         }
9530     }
9531     else if (!paren && RExC_parse < RExC_end) {
9532         if (*RExC_parse == ')') {
9533             RExC_parse++;
9534             vFAIL("Unmatched )");
9535         }
9536         else
9537             FAIL("Junk on end of regexp");      /* "Can't happen". */
9538         assert(0); /* NOTREACHED */
9539     }
9540
9541     if (RExC_in_lookbehind) {
9542         RExC_in_lookbehind--;
9543     }
9544     if (after_freeze > RExC_npar)
9545         RExC_npar = after_freeze;
9546     return(ret);
9547 }
9548
9549 /*
9550  - regbranch - one alternative of an | operator
9551  *
9552  * Implements the concatenation operator.
9553  *
9554  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
9555  * restarted.
9556  */
9557 STATIC regnode *
9558 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9559 {
9560     dVAR;
9561     regnode *ret;
9562     regnode *chain = NULL;
9563     regnode *latest;
9564     I32 flags = 0, c = 0;
9565     GET_RE_DEBUG_FLAGS_DECL;
9566
9567     PERL_ARGS_ASSERT_REGBRANCH;
9568
9569     DEBUG_PARSE("brnc");
9570
9571     if (first)
9572         ret = NULL;
9573     else {
9574         if (!SIZE_ONLY && RExC_extralen)
9575             ret = reganode(pRExC_state, BRANCHJ,0);
9576         else {
9577             ret = reg_node(pRExC_state, BRANCH);
9578             Set_Node_Length(ret, 1);
9579         }
9580     }
9581
9582     if (!first && SIZE_ONLY)
9583         RExC_extralen += 1;                     /* BRANCHJ */
9584
9585     *flagp = WORST;                     /* Tentatively. */
9586
9587     RExC_parse--;
9588     nextchar(pRExC_state);
9589     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9590         flags &= ~TRYAGAIN;
9591         latest = regpiece(pRExC_state, &flags,depth+1);
9592         if (latest == NULL) {
9593             if (flags & TRYAGAIN)
9594                 continue;
9595             if (flags & RESTART_UTF8) {
9596                 *flagp = RESTART_UTF8;
9597                 return NULL;
9598             }
9599             FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
9600         }
9601         else if (ret == NULL)
9602             ret = latest;
9603         *flagp |= flags&(HASWIDTH|POSTPONED);
9604         if (chain == NULL)      /* First piece. */
9605             *flagp |= flags&SPSTART;
9606         else {
9607             RExC_naughty++;
9608             REGTAIL(pRExC_state, chain, latest);
9609         }
9610         chain = latest;
9611         c++;
9612     }
9613     if (chain == NULL) {        /* Loop ran zero times. */
9614         chain = reg_node(pRExC_state, NOTHING);
9615         if (ret == NULL)
9616             ret = chain;
9617     }
9618     if (c == 1) {
9619         *flagp |= flags&SIMPLE;
9620     }
9621
9622     return ret;
9623 }
9624
9625 /*
9626  - regpiece - something followed by possible [*+?]
9627  *
9628  * Note that the branching code sequences used for ? and the general cases
9629  * of * and + are somewhat optimized:  they use the same NOTHING node as
9630  * both the endmarker for their branch list and the body of the last branch.
9631  * It might seem that this node could be dispensed with entirely, but the
9632  * endmarker role is not redundant.
9633  *
9634  * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
9635  * TRYAGAIN.
9636  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
9637  * restarted.
9638  */
9639 STATIC regnode *
9640 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9641 {
9642     dVAR;
9643     regnode *ret;
9644     char op;
9645     char *next;
9646     I32 flags;
9647     const char * const origparse = RExC_parse;
9648     I32 min;
9649     I32 max = REG_INFTY;
9650 #ifdef RE_TRACK_PATTERN_OFFSETS
9651     char *parse_start;
9652 #endif
9653     const char *maxpos = NULL;
9654
9655     /* Save the original in case we change the emitted regop to a FAIL. */
9656     regnode * const orig_emit = RExC_emit;
9657
9658     GET_RE_DEBUG_FLAGS_DECL;
9659
9660     PERL_ARGS_ASSERT_REGPIECE;
9661
9662     DEBUG_PARSE("piec");
9663
9664     ret = regatom(pRExC_state, &flags,depth+1);
9665     if (ret == NULL) {
9666         if (flags & (TRYAGAIN|RESTART_UTF8))
9667             *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
9668         else
9669             FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
9670         return(NULL);
9671     }
9672
9673     op = *RExC_parse;
9674
9675     if (op == '{' && regcurly(RExC_parse, FALSE)) {
9676         maxpos = NULL;
9677 #ifdef RE_TRACK_PATTERN_OFFSETS
9678         parse_start = RExC_parse; /* MJD */
9679 #endif
9680         next = RExC_parse + 1;
9681         while (isDIGIT(*next) || *next == ',') {
9682             if (*next == ',') {
9683                 if (maxpos)
9684                     break;
9685                 else
9686                     maxpos = next;
9687             }
9688             next++;
9689         }
9690         if (*next == '}') {             /* got one */
9691             if (!maxpos)
9692                 maxpos = next;
9693             RExC_parse++;
9694             min = atoi(RExC_parse);
9695             if (*maxpos == ',')
9696                 maxpos++;
9697             else
9698                 maxpos = RExC_parse;
9699             max = atoi(maxpos);
9700             if (!max && *maxpos != '0')
9701                 max = REG_INFTY;                /* meaning "infinity" */
9702             else if (max >= REG_INFTY)
9703                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9704             RExC_parse = next;
9705             nextchar(pRExC_state);
9706             if (max < min) {    /* If can't match, warn and optimize to fail
9707                                    unconditionally */
9708                 if (SIZE_ONLY) {
9709                     ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
9710
9711                     /* We can't back off the size because we have to reserve
9712                      * enough space for all the things we are about to throw
9713                      * away, but we can shrink it by the ammount we are about
9714                      * to re-use here */
9715                     RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
9716                 }
9717                 else {
9718                     RExC_emit = orig_emit;
9719                 }
9720                 ret = reg_node(pRExC_state, OPFAIL);
9721                 return ret;
9722             }
9723
9724         do_curly:
9725             if ((flags&SIMPLE)) {
9726                 RExC_naughty += 2 + RExC_naughty / 2;
9727                 reginsert(pRExC_state, CURLY, ret, depth+1);
9728                 Set_Node_Offset(ret, parse_start+1); /* MJD */
9729                 Set_Node_Cur_Length(ret, parse_start);
9730             }
9731             else {
9732                 regnode * const w = reg_node(pRExC_state, WHILEM);
9733
9734                 w->flags = 0;
9735                 REGTAIL(pRExC_state, ret, w);
9736                 if (!SIZE_ONLY && RExC_extralen) {
9737                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
9738                     reginsert(pRExC_state, NOTHING,ret, depth+1);
9739                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
9740                 }
9741                 reginsert(pRExC_state, CURLYX,ret, depth+1);
9742                                 /* MJD hk */
9743                 Set_Node_Offset(ret, parse_start+1);
9744                 Set_Node_Length(ret,
9745                                 op == '{' ? (RExC_parse - parse_start) : 1);
9746
9747                 if (!SIZE_ONLY && RExC_extralen)
9748                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
9749                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9750                 if (SIZE_ONLY)
9751                     RExC_whilem_seen++, RExC_extralen += 3;
9752                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
9753             }
9754             ret->flags = 0;
9755
9756             if (min > 0)
9757                 *flagp = WORST;
9758             if (max > 0)
9759                 *flagp |= HASWIDTH;
9760             if (!SIZE_ONLY) {
9761                 ARG1_SET(ret, (U16)min);
9762                 ARG2_SET(ret, (U16)max);
9763             }
9764
9765             goto nest_check;
9766         }
9767     }
9768
9769     if (!ISMULT1(op)) {
9770         *flagp = flags;
9771         return(ret);
9772     }
9773
9774 #if 0                           /* Now runtime fix should be reliable. */
9775
9776     /* if this is reinstated, don't forget to put this back into perldiag:
9777
9778             =item Regexp *+ operand could be empty at {#} in regex m/%s/
9779
9780            (F) The part of the regexp subject to either the * or + quantifier
9781            could match an empty string. The {#} shows in the regular
9782            expression about where the problem was discovered.
9783
9784     */
9785
9786     if (!(flags&HASWIDTH) && op != '?')
9787       vFAIL("Regexp *+ operand could be empty");
9788 #endif
9789
9790 #ifdef RE_TRACK_PATTERN_OFFSETS
9791     parse_start = RExC_parse;
9792 #endif
9793     nextchar(pRExC_state);
9794
9795     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9796
9797     if (op == '*' && (flags&SIMPLE)) {
9798         reginsert(pRExC_state, STAR, ret, depth+1);
9799         ret->flags = 0;
9800         RExC_naughty += 4;
9801     }
9802     else if (op == '*') {
9803         min = 0;
9804         goto do_curly;
9805     }
9806     else if (op == '+' && (flags&SIMPLE)) {
9807         reginsert(pRExC_state, PLUS, ret, depth+1);
9808         ret->flags = 0;
9809         RExC_naughty += 3;
9810     }
9811     else if (op == '+') {
9812         min = 1;
9813         goto do_curly;
9814     }
9815     else if (op == '?') {
9816         min = 0; max = 1;
9817         goto do_curly;
9818     }
9819   nest_check:
9820     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9821         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
9822         ckWARN3reg(RExC_parse,
9823                    "%.*s matches null string many times",
9824                    (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9825                    origparse);
9826         (void)ReREFCNT_inc(RExC_rx_sv);
9827     }
9828
9829     if (RExC_parse < RExC_end && *RExC_parse == '?') {
9830         nextchar(pRExC_state);
9831         reginsert(pRExC_state, MINMOD, ret, depth+1);
9832         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9833     }
9834     else
9835     if (RExC_parse < RExC_end && *RExC_parse == '+') {
9836         regnode *ender;
9837         nextchar(pRExC_state);
9838         ender = reg_node(pRExC_state, SUCCEED);
9839         REGTAIL(pRExC_state, ret, ender);
9840         reginsert(pRExC_state, SUSPEND, ret, depth+1);
9841         ret->flags = 0;
9842         ender = reg_node(pRExC_state, TAIL);
9843         REGTAIL(pRExC_state, ret, ender);
9844     }
9845
9846     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9847         RExC_parse++;
9848         vFAIL("Nested quantifiers");
9849     }
9850
9851     return(ret);
9852 }
9853
9854 STATIC bool
9855 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
9856         const bool strict   /* Apply stricter parsing rules? */
9857     )
9858 {
9859
9860  /* This is expected to be called by a parser routine that has recognized '\N'
9861    and needs to handle the rest. RExC_parse is expected to point at the first
9862    char following the N at the time of the call.  On successful return,
9863    RExC_parse has been updated to point to just after the sequence identified
9864    by this routine, and <*flagp> has been updated.
9865
9866    The \N may be inside (indicated by the boolean <in_char_class>) or outside a
9867    character class.
9868
9869    \N may begin either a named sequence, or if outside a character class, mean
9870    to match a non-newline.  For non single-quoted regexes, the tokenizer has
9871    attempted to decide which, and in the case of a named sequence, converted it
9872    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9873    where c1... are the characters in the sequence.  For single-quoted regexes,
9874    the tokenizer passes the \N sequence through unchanged; this code will not
9875    attempt to determine this nor expand those, instead raising a syntax error.
9876    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
9877    or there is no '}', it signals that this \N occurrence means to match a
9878    non-newline.
9879
9880    Only the \N{U+...} form should occur in a character class, for the same
9881    reason that '.' inside a character class means to just match a period: it
9882    just doesn't make sense.
9883
9884    The function raises an error (via vFAIL), and doesn't return for various
9885    syntax errors.  Otherwise it returns TRUE and sets <node_p> or <valuep> on
9886    success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
9887    RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
9888    only possible if node_p is non-NULL.
9889
9890
9891    If <valuep> is non-null, it means the caller can accept an input sequence
9892    consisting of a just a single code point; <*valuep> is set to that value
9893    if the input is such.
9894
9895    If <node_p> is non-null it signifies that the caller can accept any other
9896    legal sequence (i.e., one that isn't just a single code point).  <*node_p>
9897    is set as follows:
9898     1) \N means not-a-NL: points to a newly created REG_ANY node;
9899     2) \N{}:              points to a new NOTHING node;
9900     3) otherwise:         points to a new EXACT node containing the resolved
9901                           string.
9902    Note that FALSE is returned for single code point sequences if <valuep> is
9903    null.
9904  */
9905
9906     char * endbrace;    /* '}' following the name */
9907     char* p;
9908     char *endchar;      /* Points to '.' or '}' ending cur char in the input
9909                            stream */
9910     bool has_multiple_chars; /* true if the input stream contains a sequence of
9911                                 more than one character */
9912
9913     GET_RE_DEBUG_FLAGS_DECL;
9914
9915     PERL_ARGS_ASSERT_GROK_BSLASH_N;
9916
9917     GET_RE_DEBUG_FLAGS;
9918
9919     assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
9920
9921     /* The [^\n] meaning of \N ignores spaces and comments under the /x
9922      * modifier.  The other meaning does not */
9923     p = (RExC_flags & RXf_PMf_EXTENDED)
9924         ? regwhite( pRExC_state, RExC_parse )
9925         : RExC_parse;
9926
9927     /* Disambiguate between \N meaning a named character versus \N meaning
9928      * [^\n].  The former is assumed when it can't be the latter. */
9929     if (*p != '{' || regcurly(p, FALSE)) {
9930         RExC_parse = p;
9931         if (! node_p) {
9932             /* no bare \N in a charclass */
9933             if (in_char_class) {
9934                 vFAIL("\\N in a character class must be a named character: \\N{...}");
9935             }
9936             return FALSE;
9937         }
9938         nextchar(pRExC_state);
9939         *node_p = reg_node(pRExC_state, REG_ANY);
9940         *flagp |= HASWIDTH|SIMPLE;
9941         RExC_naughty++;
9942         RExC_parse--;
9943         Set_Node_Length(*node_p, 1); /* MJD */
9944         return TRUE;
9945     }
9946
9947     /* Here, we have decided it should be a named character or sequence */
9948
9949     /* The test above made sure that the next real character is a '{', but
9950      * under the /x modifier, it could be separated by space (or a comment and
9951      * \n) and this is not allowed (for consistency with \x{...} and the
9952      * tokenizer handling of \N{NAME}). */
9953     if (*RExC_parse != '{') {
9954         vFAIL("Missing braces on \\N{}");
9955     }
9956
9957     RExC_parse++;       /* Skip past the '{' */
9958
9959     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9960         || ! (endbrace == RExC_parse            /* nothing between the {} */
9961               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
9962                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9963     {
9964         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
9965         vFAIL("\\N{NAME} must be resolved by the lexer");
9966     }
9967
9968     if (endbrace == RExC_parse) {   /* empty: \N{} */
9969         bool ret = TRUE;
9970         if (node_p) {
9971             *node_p = reg_node(pRExC_state,NOTHING);
9972         }
9973         else if (in_char_class) {
9974             if (SIZE_ONLY && in_char_class) {
9975                 if (strict) {
9976                     RExC_parse++;   /* Position after the "}" */
9977                     vFAIL("Zero length \\N{}");
9978                 }
9979                 else {
9980                     ckWARNreg(RExC_parse,
9981                               "Ignoring zero length \\N{} in character class");
9982                 }
9983             }
9984             ret = FALSE;
9985         }
9986         else {
9987             return FALSE;
9988         }
9989         nextchar(pRExC_state);
9990         return ret;
9991     }
9992
9993     RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
9994     RExC_parse += 2;    /* Skip past the 'U+' */
9995
9996     endchar = RExC_parse + strcspn(RExC_parse, ".}");
9997
9998     /* Code points are separated by dots.  If none, there is only one code
9999      * point, and is terminated by the brace */
10000     has_multiple_chars = (endchar < endbrace);
10001
10002     if (valuep && (! has_multiple_chars || in_char_class)) {
10003         /* We only pay attention to the first char of
10004         multichar strings being returned in char classes. I kinda wonder
10005         if this makes sense as it does change the behaviour
10006         from earlier versions, OTOH that behaviour was broken
10007         as well. XXX Solution is to recharacterize as
10008         [rest-of-class]|multi1|multi2... */
10009
10010         STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
10011         I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
10012             | PERL_SCAN_DISALLOW_PREFIX
10013             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
10014
10015         *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
10016
10017         /* The tokenizer should have guaranteed validity, but it's possible to
10018          * bypass it by using single quoting, so check */
10019         if (length_of_hex == 0
10020             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
10021         {
10022             RExC_parse += length_of_hex;        /* Includes all the valid */
10023             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
10024                             ? UTF8SKIP(RExC_parse)
10025                             : 1;
10026             /* Guard against malformed utf8 */
10027             if (RExC_parse >= endchar) {
10028                 RExC_parse = endchar;
10029             }
10030             vFAIL("Invalid hexadecimal number in \\N{U+...}");
10031         }
10032
10033         if (in_char_class && has_multiple_chars) {
10034             if (strict) {
10035                 RExC_parse = endbrace;
10036                 vFAIL("\\N{} in character class restricted to one character");
10037             }
10038             else {
10039                 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
10040             }
10041         }
10042
10043         RExC_parse = endbrace + 1;
10044     }
10045     else if (! node_p || ! has_multiple_chars) {
10046
10047         /* Here, the input is legal, but not according to the caller's
10048          * options.  We fail without advancing the parse, so that the
10049          * caller can try again */
10050         RExC_parse = p;
10051         return FALSE;
10052     }
10053     else {
10054
10055         /* What is done here is to convert this to a sub-pattern of the form
10056          * (?:\x{char1}\x{char2}...)
10057          * and then call reg recursively.  That way, it retains its atomicness,
10058          * while not having to worry about special handling that some code
10059          * points may have.  toke.c has converted the original Unicode values
10060          * to native, so that we can just pass on the hex values unchanged.  We
10061          * do have to set a flag to keep recoding from happening in the
10062          * recursion */
10063
10064         SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
10065         STRLEN len;
10066         char *orig_end = RExC_end;
10067         I32 flags;
10068
10069         while (RExC_parse < endbrace) {
10070
10071             /* Convert to notation the rest of the code understands */
10072             sv_catpv(substitute_parse, "\\x{");
10073             sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
10074             sv_catpv(substitute_parse, "}");
10075
10076             /* Point to the beginning of the next character in the sequence. */
10077             RExC_parse = endchar + 1;
10078             endchar = RExC_parse + strcspn(RExC_parse, ".}");
10079         }
10080         sv_catpv(substitute_parse, ")");
10081
10082         RExC_parse = SvPV(substitute_parse, len);
10083
10084         /* Don't allow empty number */
10085         if (len < 8) {
10086             vFAIL("Invalid hexadecimal number in \\N{U+...}");
10087         }
10088         RExC_end = RExC_parse + len;
10089
10090         /* The values are Unicode, and therefore not subject to recoding */
10091         RExC_override_recoding = 1;
10092
10093         if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
10094             if (flags & RESTART_UTF8) {
10095                 *flagp = RESTART_UTF8;
10096                 return FALSE;
10097             }
10098             FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
10099                   (UV) flags);
10100         }
10101         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10102
10103         RExC_parse = endbrace;
10104         RExC_end = orig_end;
10105         RExC_override_recoding = 0;
10106
10107         nextchar(pRExC_state);
10108     }
10109
10110     return TRUE;
10111 }
10112
10113
10114 /*
10115  * reg_recode
10116  *
10117  * It returns the code point in utf8 for the value in *encp.
10118  *    value: a code value in the source encoding
10119  *    encp:  a pointer to an Encode object
10120  *
10121  * If the result from Encode is not a single character,
10122  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
10123  */
10124 STATIC UV
10125 S_reg_recode(pTHX_ const char value, SV **encp)
10126 {
10127     STRLEN numlen = 1;
10128     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
10129     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
10130     const STRLEN newlen = SvCUR(sv);
10131     UV uv = UNICODE_REPLACEMENT;
10132
10133     PERL_ARGS_ASSERT_REG_RECODE;
10134
10135     if (newlen)
10136         uv = SvUTF8(sv)
10137              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
10138              : *(U8*)s;
10139
10140     if (!newlen || numlen != newlen) {
10141         uv = UNICODE_REPLACEMENT;
10142         *encp = NULL;
10143     }
10144     return uv;
10145 }
10146
10147 PERL_STATIC_INLINE U8
10148 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
10149 {
10150     U8 op;
10151
10152     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
10153
10154     if (! FOLD) {
10155         return EXACT;
10156     }
10157
10158     op = get_regex_charset(RExC_flags);
10159     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
10160         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
10161                  been, so there is no hole */
10162     }
10163
10164     return op + EXACTF;
10165 }
10166
10167 PERL_STATIC_INLINE void
10168 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
10169 {
10170     /* This knows the details about sizing an EXACTish node, setting flags for
10171      * it (by setting <*flagp>, and potentially populating it with a single
10172      * character.
10173      *
10174      * If <len> (the length in bytes) is non-zero, this function assumes that
10175      * the node has already been populated, and just does the sizing.  In this
10176      * case <code_point> should be the final code point that has already been
10177      * placed into the node.  This value will be ignored except that under some
10178      * circumstances <*flagp> is set based on it.
10179      *
10180      * If <len> is zero, the function assumes that the node is to contain only
10181      * the single character given by <code_point> and calculates what <len>
10182      * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
10183      * additionally will populate the node's STRING with <code_point>, if <len>
10184      * is 0.  In both cases <*flagp> is appropriately set
10185      *
10186      * It knows that under FOLD, the Latin Sharp S and UTF characters above
10187      * 255, must be folded (the former only when the rules indicate it can
10188      * match 'ss') */
10189
10190     bool len_passed_in = cBOOL(len != 0);
10191     U8 character[UTF8_MAXBYTES_CASE+1];
10192
10193     PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
10194
10195     if (! len_passed_in) {
10196         if (UTF) {
10197             if (FOLD && (! LOC || code_point > 255)) {
10198                 _to_uni_fold_flags(code_point,
10199                                    character,
10200                                    &len,
10201                                    FOLD_FLAGS_FULL | ((LOC)
10202                                                      ? FOLD_FLAGS_LOCALE
10203                                                      : (ASCII_FOLD_RESTRICTED)
10204                                                        ? FOLD_FLAGS_NOMIX_ASCII
10205                                                        : 0));
10206             }
10207             else {
10208                 uvchr_to_utf8( character, code_point);
10209                 len = UTF8SKIP(character);
10210             }
10211         }
10212         else if (! FOLD
10213                  || code_point != LATIN_SMALL_LETTER_SHARP_S
10214                  || ASCII_FOLD_RESTRICTED
10215                  || ! AT_LEAST_UNI_SEMANTICS)
10216         {
10217             *character = (U8) code_point;
10218             len = 1;
10219         }
10220         else {
10221             *character = 's';
10222             *(character + 1) = 's';
10223             len = 2;
10224         }
10225     }
10226
10227     if (SIZE_ONLY) {
10228         RExC_size += STR_SZ(len);
10229     }
10230     else {
10231         RExC_emit += STR_SZ(len);
10232         STR_LEN(node) = len;
10233         if (! len_passed_in) {
10234             Copy((char *) character, STRING(node), len, char);
10235         }
10236     }
10237
10238     *flagp |= HASWIDTH;
10239
10240     /* A single character node is SIMPLE, except for the special-cased SHARP S
10241      * under /di. */
10242     if ((len == 1 || (UTF && len == UNISKIP(code_point)))
10243         && (code_point != LATIN_SMALL_LETTER_SHARP_S
10244             || ! FOLD || ! DEPENDS_SEMANTICS))
10245     {
10246         *flagp |= SIMPLE;
10247     }
10248 }
10249
10250 /*
10251  - regatom - the lowest level
10252
10253    Try to identify anything special at the start of the pattern. If there
10254    is, then handle it as required. This may involve generating a single regop,
10255    such as for an assertion; or it may involve recursing, such as to
10256    handle a () structure.
10257
10258    If the string doesn't start with something special then we gobble up
10259    as much literal text as we can.
10260
10261    Once we have been able to handle whatever type of thing started the
10262    sequence, we return.
10263
10264    Note: we have to be careful with escapes, as they can be both literal
10265    and special, and in the case of \10 and friends, context determines which.
10266
10267    A summary of the code structure is:
10268
10269    switch (first_byte) {
10270         cases for each special:
10271             handle this special;
10272             break;
10273         case '\\':
10274             switch (2nd byte) {
10275                 cases for each unambiguous special:
10276                     handle this special;
10277                     break;
10278                 cases for each ambigous special/literal:
10279                     disambiguate;
10280                     if (special)  handle here
10281                     else goto defchar;
10282                 default: // unambiguously literal:
10283                     goto defchar;
10284             }
10285         default:  // is a literal char
10286             // FALL THROUGH
10287         defchar:
10288             create EXACTish node for literal;
10289             while (more input and node isn't full) {
10290                 switch (input_byte) {
10291                    cases for each special;
10292                        make sure parse pointer is set so that the next call to
10293                            regatom will see this special first
10294                        goto loopdone; // EXACTish node terminated by prev. char
10295                    default:
10296                        append char to EXACTISH node;
10297                 }
10298                 get next input byte;
10299             }
10300         loopdone:
10301    }
10302    return the generated node;
10303
10304    Specifically there are two separate switches for handling
10305    escape sequences, with the one for handling literal escapes requiring
10306    a dummy entry for all of the special escapes that are actually handled
10307    by the other.
10308
10309    Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
10310    TRYAGAIN.
10311    Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10312    restarted.
10313    Otherwise does not return NULL.
10314 */
10315
10316 STATIC regnode *
10317 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10318 {
10319     dVAR;
10320     regnode *ret = NULL;
10321     I32 flags = 0;
10322     char *parse_start = RExC_parse;
10323     U8 op;
10324     int invert = 0;
10325
10326     GET_RE_DEBUG_FLAGS_DECL;
10327
10328     *flagp = WORST;             /* Tentatively. */
10329
10330     DEBUG_PARSE("atom");
10331
10332     PERL_ARGS_ASSERT_REGATOM;
10333
10334 tryagain:
10335     switch ((U8)*RExC_parse) {
10336     case '^':
10337         RExC_seen_zerolen++;
10338         nextchar(pRExC_state);
10339         if (RExC_flags & RXf_PMf_MULTILINE)
10340             ret = reg_node(pRExC_state, MBOL);
10341         else if (RExC_flags & RXf_PMf_SINGLELINE)
10342             ret = reg_node(pRExC_state, SBOL);
10343         else
10344             ret = reg_node(pRExC_state, BOL);
10345         Set_Node_Length(ret, 1); /* MJD */
10346         break;
10347     case '$':
10348         nextchar(pRExC_state);
10349         if (*RExC_parse)
10350             RExC_seen_zerolen++;
10351         if (RExC_flags & RXf_PMf_MULTILINE)
10352             ret = reg_node(pRExC_state, MEOL);
10353         else if (RExC_flags & RXf_PMf_SINGLELINE)
10354             ret = reg_node(pRExC_state, SEOL);
10355         else
10356             ret = reg_node(pRExC_state, EOL);
10357         Set_Node_Length(ret, 1); /* MJD */
10358         break;
10359     case '.':
10360         nextchar(pRExC_state);
10361         if (RExC_flags & RXf_PMf_SINGLELINE)
10362             ret = reg_node(pRExC_state, SANY);
10363         else
10364             ret = reg_node(pRExC_state, REG_ANY);
10365         *flagp |= HASWIDTH|SIMPLE;
10366         RExC_naughty++;
10367         Set_Node_Length(ret, 1); /* MJD */
10368         break;
10369     case '[':
10370     {
10371         char * const oregcomp_parse = ++RExC_parse;
10372         ret = regclass(pRExC_state, flagp,depth+1,
10373                        FALSE, /* means parse the whole char class */
10374                        TRUE, /* allow multi-char folds */
10375                        FALSE, /* don't silence non-portable warnings. */
10376                        NULL);
10377         if (*RExC_parse != ']') {
10378             RExC_parse = oregcomp_parse;
10379             vFAIL("Unmatched [");
10380         }
10381         if (ret == NULL) {
10382             if (*flagp & RESTART_UTF8)
10383                 return NULL;
10384             FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
10385                   (UV) *flagp);
10386         }
10387         nextchar(pRExC_state);
10388         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10389         break;
10390     }
10391     case '(':
10392         nextchar(pRExC_state);
10393         ret = reg(pRExC_state, 2, &flags,depth+1);
10394         if (ret == NULL) {
10395                 if (flags & TRYAGAIN) {
10396                     if (RExC_parse == RExC_end) {
10397                          /* Make parent create an empty node if needed. */
10398                         *flagp |= TRYAGAIN;
10399                         return(NULL);
10400                     }
10401                     goto tryagain;
10402                 }
10403                 if (flags & RESTART_UTF8) {
10404                     *flagp = RESTART_UTF8;
10405                     return NULL;
10406                 }
10407                 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", (UV) flags);
10408         }
10409         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10410         break;
10411     case '|':
10412     case ')':
10413         if (flags & TRYAGAIN) {
10414             *flagp |= TRYAGAIN;
10415             return NULL;
10416         }
10417         vFAIL("Internal urp");
10418                                 /* Supposed to be caught earlier. */
10419         break;
10420     case '{':
10421         if (!regcurly(RExC_parse, FALSE)) {
10422             RExC_parse++;
10423             goto defchar;
10424         }
10425         /* FALL THROUGH */
10426     case '?':
10427     case '+':
10428     case '*':
10429         RExC_parse++;
10430         vFAIL("Quantifier follows nothing");
10431         break;
10432     case '\\':
10433         /* Special Escapes
10434
10435            This switch handles escape sequences that resolve to some kind
10436            of special regop and not to literal text. Escape sequnces that
10437            resolve to literal text are handled below in the switch marked
10438            "Literal Escapes".
10439
10440            Every entry in this switch *must* have a corresponding entry
10441            in the literal escape switch. However, the opposite is not
10442            required, as the default for this switch is to jump to the
10443            literal text handling code.
10444         */
10445         switch ((U8)*++RExC_parse) {
10446             U8 arg;
10447         /* Special Escapes */
10448         case 'A':
10449             RExC_seen_zerolen++;
10450             ret = reg_node(pRExC_state, SBOL);
10451             *flagp |= SIMPLE;
10452             goto finish_meta_pat;
10453         case 'G':
10454             ret = reg_node(pRExC_state, GPOS);
10455             RExC_seen |= REG_SEEN_GPOS;
10456             *flagp |= SIMPLE;
10457             goto finish_meta_pat;
10458         case 'K':
10459             RExC_seen_zerolen++;
10460             ret = reg_node(pRExC_state, KEEPS);
10461             *flagp |= SIMPLE;
10462             /* XXX:dmq : disabling in-place substitution seems to
10463              * be necessary here to avoid cases of memory corruption, as
10464              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10465              */
10466             RExC_seen |= REG_SEEN_LOOKBEHIND;
10467             goto finish_meta_pat;
10468         case 'Z':
10469             ret = reg_node(pRExC_state, SEOL);
10470             *flagp |= SIMPLE;
10471             RExC_seen_zerolen++;                /* Do not optimize RE away */
10472             goto finish_meta_pat;
10473         case 'z':
10474             ret = reg_node(pRExC_state, EOS);
10475             *flagp |= SIMPLE;
10476             RExC_seen_zerolen++;                /* Do not optimize RE away */
10477             goto finish_meta_pat;
10478         case 'C':
10479             ret = reg_node(pRExC_state, CANY);
10480             RExC_seen |= REG_SEEN_CANY;
10481             *flagp |= HASWIDTH|SIMPLE;
10482             goto finish_meta_pat;
10483         case 'X':
10484             ret = reg_node(pRExC_state, CLUMP);
10485             *flagp |= HASWIDTH;
10486             goto finish_meta_pat;
10487
10488         case 'W':
10489             invert = 1;
10490             /* FALLTHROUGH */
10491         case 'w':
10492             arg = ANYOF_WORDCHAR;
10493             goto join_posix;
10494
10495         case 'b':
10496             RExC_seen_zerolen++;
10497             RExC_seen |= REG_SEEN_LOOKBEHIND;
10498             op = BOUND + get_regex_charset(RExC_flags);
10499             if (op > BOUNDA) {  /* /aa is same as /a */
10500                 op = BOUNDA;
10501             }
10502             ret = reg_node(pRExC_state, op);
10503             FLAGS(ret) = get_regex_charset(RExC_flags);
10504             *flagp |= SIMPLE;
10505             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10506                 ckWARNdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" or \"\\b[{]\" instead");
10507             }
10508             goto finish_meta_pat;
10509         case 'B':
10510             RExC_seen_zerolen++;
10511             RExC_seen |= REG_SEEN_LOOKBEHIND;
10512             op = NBOUND + get_regex_charset(RExC_flags);
10513             if (op > NBOUNDA) { /* /aa is same as /a */
10514                 op = NBOUNDA;
10515             }
10516             ret = reg_node(pRExC_state, op);
10517             FLAGS(ret) = get_regex_charset(RExC_flags);
10518             *flagp |= SIMPLE;
10519             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10520                 ckWARNdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" or \"\\B[{]\" instead");
10521             }
10522             goto finish_meta_pat;
10523
10524         case 'D':
10525             invert = 1;
10526             /* FALLTHROUGH */
10527         case 'd':
10528             arg = ANYOF_DIGIT;
10529             goto join_posix;
10530
10531         case 'R':
10532             ret = reg_node(pRExC_state, LNBREAK);
10533             *flagp |= HASWIDTH|SIMPLE;
10534             goto finish_meta_pat;
10535
10536         case 'H':
10537             invert = 1;
10538             /* FALLTHROUGH */
10539         case 'h':
10540             arg = ANYOF_BLANK;
10541             op = POSIXU;
10542             goto join_posix_op_known;
10543
10544         case 'V':
10545             invert = 1;
10546             /* FALLTHROUGH */
10547         case 'v':
10548             arg = ANYOF_VERTWS;
10549             op = POSIXU;
10550             goto join_posix_op_known;
10551
10552         case 'S':
10553             invert = 1;
10554             /* FALLTHROUGH */
10555         case 's':
10556             arg = ANYOF_SPACE;
10557
10558         join_posix:
10559
10560             op = POSIXD + get_regex_charset(RExC_flags);
10561             if (op > POSIXA) {  /* /aa is same as /a */
10562                 op = POSIXA;
10563             }
10564
10565         join_posix_op_known:
10566
10567             if (invert) {
10568                 op += NPOSIXD - POSIXD;
10569             }
10570
10571             ret = reg_node(pRExC_state, op);
10572             if (! SIZE_ONLY) {
10573                 FLAGS(ret) = namedclass_to_classnum(arg);
10574             }
10575
10576             *flagp |= HASWIDTH|SIMPLE;
10577             /* FALL THROUGH */
10578
10579          finish_meta_pat:
10580             nextchar(pRExC_state);
10581             Set_Node_Length(ret, 2); /* MJD */
10582             break;
10583         case 'p':
10584         case 'P':
10585             {
10586 #ifdef DEBUGGING
10587                 char* parse_start = RExC_parse - 2;
10588 #endif
10589
10590                 RExC_parse--;
10591
10592                 ret = regclass(pRExC_state, flagp,depth+1,
10593                                TRUE, /* means just parse this element */
10594                                FALSE, /* don't allow multi-char folds */
10595                                FALSE, /* don't silence non-portable warnings.
10596                                          It would be a bug if these returned
10597                                          non-portables */
10598                                NULL);
10599                 /* regclass() can only return RESTART_UTF8 if multi-char folds
10600                    are allowed.  */
10601                 if (!ret)
10602                     FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
10603                           (UV) *flagp);
10604
10605                 RExC_parse--;
10606
10607                 Set_Node_Offset(ret, parse_start + 2);
10608                 Set_Node_Cur_Length(ret, parse_start);
10609                 nextchar(pRExC_state);
10610             }
10611             break;
10612         case 'N':
10613             /* Handle \N and \N{NAME} with multiple code points here and not
10614              * below because it can be multicharacter. join_exact() will join
10615              * them up later on.  Also this makes sure that things like
10616              * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
10617              * The options to the grok function call causes it to fail if the
10618              * sequence is just a single code point.  We then go treat it as
10619              * just another character in the current EXACT node, and hence it
10620              * gets uniform treatment with all the other characters.  The
10621              * special treatment for quantifiers is not needed for such single
10622              * character sequences */
10623             ++RExC_parse;
10624             if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
10625                                 FALSE /* not strict */ )) {
10626                 if (*flagp & RESTART_UTF8)
10627                     return NULL;
10628                 RExC_parse--;
10629                 goto defchar;
10630             }
10631             break;
10632         case 'k':    /* Handle \k<NAME> and \k'NAME' */
10633         parse_named_seq:
10634         {
10635             char ch= RExC_parse[1];
10636             if (ch != '<' && ch != '\'' && ch != '{') {
10637                 RExC_parse++;
10638                 vFAIL2("Sequence %.2s... not terminated",parse_start);
10639             } else {
10640                 /* this pretty much dupes the code for (?P=...) in reg(), if
10641                    you change this make sure you change that */
10642                 char* name_start = (RExC_parse += 2);
10643                 U32 num = 0;
10644                 SV *sv_dat = reg_scan_name(pRExC_state,
10645                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10646                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
10647                 if (RExC_parse == name_start || *RExC_parse != ch)
10648                     vFAIL2("Sequence %.3s... not terminated",parse_start);
10649
10650                 if (!SIZE_ONLY) {
10651                     num = add_data( pRExC_state, 1, "S" );
10652                     RExC_rxi->data->data[num]=(void*)sv_dat;
10653                     SvREFCNT_inc_simple_void(sv_dat);
10654                 }
10655
10656                 RExC_sawback = 1;
10657                 ret = reganode(pRExC_state,
10658                                ((! FOLD)
10659                                  ? NREF
10660                                  : (ASCII_FOLD_RESTRICTED)
10661                                    ? NREFFA
10662                                    : (AT_LEAST_UNI_SEMANTICS)
10663                                      ? NREFFU
10664                                      : (LOC)
10665                                        ? NREFFL
10666                                        : NREFF),
10667                                 num);
10668                 *flagp |= HASWIDTH;
10669
10670                 /* override incorrect value set in reganode MJD */
10671                 Set_Node_Offset(ret, parse_start+1);
10672                 Set_Node_Cur_Length(ret, parse_start);
10673                 nextchar(pRExC_state);
10674
10675             }
10676             break;
10677         }
10678         case 'g':
10679         case '1': case '2': case '3': case '4':
10680         case '5': case '6': case '7': case '8': case '9':
10681             {
10682                 I32 num;
10683                 bool isg = *RExC_parse == 'g';
10684                 bool isrel = 0;
10685                 bool hasbrace = 0;
10686                 if (isg) {
10687                     RExC_parse++;
10688                     if (*RExC_parse == '{') {
10689                         RExC_parse++;
10690                         hasbrace = 1;
10691                     }
10692                     if (*RExC_parse == '-') {
10693                         RExC_parse++;
10694                         isrel = 1;
10695                     }
10696                     if (hasbrace && !isDIGIT(*RExC_parse)) {
10697                         if (isrel) RExC_parse--;
10698                         RExC_parse -= 2;
10699                         goto parse_named_seq;
10700                 }   }
10701                 num = atoi(RExC_parse);
10702                 if (isg && num == 0) {
10703                     if (*RExC_parse == '0') {
10704                         vFAIL("Reference to invalid group 0");
10705                     }
10706                     else {
10707                         vFAIL("Unterminated \\g... pattern");
10708                     }
10709                 }
10710                 if (isrel) {
10711                     num = RExC_npar - num;
10712                     if (num < 1)
10713                         vFAIL("Reference to nonexistent or unclosed group");
10714                 }
10715                 if (!isg && num > 9 && num >= RExC_npar && *RExC_parse != '8' && *RExC_parse != '9')
10716                     /* Probably a character specified in octal, e.g. \35 */
10717                     goto defchar;
10718                 else {
10719 #ifdef RE_TRACK_PATTERN_OFFSETS
10720                     char * const parse_start = RExC_parse - 1; /* MJD */
10721 #endif
10722                     while (isDIGIT(*RExC_parse))
10723                         RExC_parse++;
10724                     if (hasbrace) {
10725                         if (*RExC_parse != '}')
10726                             vFAIL("Unterminated \\g{...} pattern");
10727                         RExC_parse++;
10728                     }
10729                     if (!SIZE_ONLY) {
10730                         if (num > (I32)RExC_rx->nparens)
10731                             vFAIL("Reference to nonexistent group");
10732                     }
10733                     RExC_sawback = 1;
10734                     ret = reganode(pRExC_state,
10735                                    ((! FOLD)
10736                                      ? REF
10737                                      : (ASCII_FOLD_RESTRICTED)
10738                                        ? REFFA
10739                                        : (AT_LEAST_UNI_SEMANTICS)
10740                                          ? REFFU
10741                                          : (LOC)
10742                                            ? REFFL
10743                                            : REFF),
10744                                     num);
10745                     *flagp |= HASWIDTH;
10746
10747                     /* override incorrect value set in reganode MJD */
10748                     Set_Node_Offset(ret, parse_start+1);
10749                     Set_Node_Cur_Length(ret, parse_start);
10750                     RExC_parse--;
10751                     nextchar(pRExC_state);
10752                 }
10753             }
10754             break;
10755         case '\0':
10756             if (RExC_parse >= RExC_end)
10757                 FAIL("Trailing \\");
10758             /* FALL THROUGH */
10759         default:
10760             /* Do not generate "unrecognized" warnings here, we fall
10761                back into the quick-grab loop below */
10762             parse_start--;
10763             goto defchar;
10764         }
10765         break;
10766
10767     case '#':
10768         if (RExC_flags & RXf_PMf_EXTENDED) {
10769             if ( reg_skipcomment( pRExC_state ) )
10770                 goto tryagain;
10771         }
10772         /* FALL THROUGH */
10773
10774     default:
10775
10776             parse_start = RExC_parse - 1;
10777
10778             RExC_parse++;
10779
10780         defchar: {
10781             STRLEN len = 0;
10782             UV ender = 0;
10783             char *p;
10784             char *s;
10785 #define MAX_NODE_STRING_SIZE 127
10786             char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
10787             char *s0;
10788             U8 upper_parse = MAX_NODE_STRING_SIZE;
10789             STRLEN foldlen;
10790             U8 node_type = compute_EXACTish(pRExC_state);
10791             bool next_is_quantifier;
10792             char * oldp = NULL;
10793
10794             /* We can convert EXACTF nodes to EXACTFU if they contain only
10795              * characters that match identically regardless of the target
10796              * string's UTF8ness.  The reason to do this is that EXACTF is not
10797              * trie-able, EXACTFU is.  (We don't need to figure this out until
10798              * pass 2) */
10799             bool maybe_exactfu = node_type == EXACTF && PASS2;
10800
10801             /* If a folding node contains only code points that don't
10802              * participate in folds, it can be changed into an EXACT node,
10803              * which allows the optimizer more things to look for */
10804             bool maybe_exact;
10805
10806             ret = reg_node(pRExC_state, node_type);
10807
10808             /* In pass1, folded, we use a temporary buffer instead of the
10809              * actual node, as the node doesn't exist yet */
10810             s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
10811
10812             s0 = s;
10813
10814         reparse:
10815
10816             /* We do the EXACTFish to EXACT node only if folding, and not if in
10817              * locale, as whether a character folds or not isn't known until
10818              * runtime.  (And we don't need to figure this out until pass 2) */
10819             maybe_exact = FOLD && ! LOC && PASS2;
10820
10821             /* XXX The node can hold up to 255 bytes, yet this only goes to
10822              * 127.  I (khw) do not know why.  Keeping it somewhat less than
10823              * 255 allows us to not have to worry about overflow due to
10824              * converting to utf8 and fold expansion, but that value is
10825              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
10826              * split up by this limit into a single one using the real max of
10827              * 255.  Even at 127, this breaks under rare circumstances.  If
10828              * folding, we do not want to split a node at a character that is a
10829              * non-final in a multi-char fold, as an input string could just
10830              * happen to want to match across the node boundary.  The join
10831              * would solve that problem if the join actually happens.  But a
10832              * series of more than two nodes in a row each of 127 would cause
10833              * the first join to succeed to get to 254, but then there wouldn't
10834              * be room for the next one, which could at be one of those split
10835              * multi-char folds.  I don't know of any fool-proof solution.  One
10836              * could back off to end with only a code point that isn't such a
10837              * non-final, but it is possible for there not to be any in the
10838              * entire node. */
10839             for (p = RExC_parse - 1;
10840                  len < upper_parse && p < RExC_end;
10841                  len++)
10842             {
10843                 oldp = p;
10844
10845                 if (RExC_flags & RXf_PMf_EXTENDED)
10846                     p = regwhite( pRExC_state, p );
10847                 switch ((U8)*p) {
10848                 case '^':
10849                 case '$':
10850                 case '.':
10851                 case '[':
10852                 case '(':
10853                 case ')':
10854                 case '|':
10855                     goto loopdone;
10856                 case '\\':
10857                     /* Literal Escapes Switch
10858
10859                        This switch is meant to handle escape sequences that
10860                        resolve to a literal character.
10861
10862                        Every escape sequence that represents something
10863                        else, like an assertion or a char class, is handled
10864                        in the switch marked 'Special Escapes' above in this
10865                        routine, but also has an entry here as anything that
10866                        isn't explicitly mentioned here will be treated as
10867                        an unescaped equivalent literal.
10868                     */
10869
10870                     switch ((U8)*++p) {
10871                     /* These are all the special escapes. */
10872                     case 'A':             /* Start assertion */
10873                     case 'b': case 'B':   /* Word-boundary assertion*/
10874                     case 'C':             /* Single char !DANGEROUS! */
10875                     case 'd': case 'D':   /* digit class */
10876                     case 'g': case 'G':   /* generic-backref, pos assertion */
10877                     case 'h': case 'H':   /* HORIZWS */
10878                     case 'k': case 'K':   /* named backref, keep marker */
10879                     case 'p': case 'P':   /* Unicode property */
10880                               case 'R':   /* LNBREAK */
10881                     case 's': case 'S':   /* space class */
10882                     case 'v': case 'V':   /* VERTWS */
10883                     case 'w': case 'W':   /* word class */
10884                     case 'X':             /* eXtended Unicode "combining character sequence" */
10885                     case 'z': case 'Z':   /* End of line/string assertion */
10886                         --p;
10887                         goto loopdone;
10888
10889                     /* Anything after here is an escape that resolves to a
10890                        literal. (Except digits, which may or may not)
10891                      */
10892                     case 'n':
10893                         ender = '\n';
10894                         p++;
10895                         break;
10896                     case 'N': /* Handle a single-code point named character. */
10897                         /* The options cause it to fail if a multiple code
10898                          * point sequence.  Handle those in the switch() above
10899                          * */
10900                         RExC_parse = p + 1;
10901                         if (! grok_bslash_N(pRExC_state, NULL, &ender,
10902                                             flagp, depth, FALSE,
10903                                             FALSE /* not strict */ ))
10904                         {
10905                             if (*flagp & RESTART_UTF8)
10906                                 FAIL("panic: grok_bslash_N set RESTART_UTF8");
10907                             RExC_parse = p = oldp;
10908                             goto loopdone;
10909                         }
10910                         p = RExC_parse;
10911                         if (ender > 0xff) {
10912                             REQUIRE_UTF8;
10913                         }
10914                         break;
10915                     case 'r':
10916                         ender = '\r';
10917                         p++;
10918                         break;
10919                     case 't':
10920                         ender = '\t';
10921                         p++;
10922                         break;
10923                     case 'f':
10924                         ender = '\f';
10925                         p++;
10926                         break;
10927                     case 'e':
10928                           ender = ASCII_TO_NATIVE('\033');
10929                         p++;
10930                         break;
10931                     case 'a':
10932                           ender = '\a';
10933                         p++;
10934                         break;
10935                     case 'o':
10936                         {
10937                             UV result;
10938                             const char* error_msg;
10939
10940                             bool valid = grok_bslash_o(&p,
10941                                                        &result,
10942                                                        &error_msg,
10943                                                        TRUE, /* out warnings */
10944                                                        FALSE, /* not strict */
10945                                                        TRUE, /* Output warnings
10946                                                                 for non-
10947                                                                 portables */
10948                                                        UTF);
10949                             if (! valid) {
10950                                 RExC_parse = p; /* going to die anyway; point
10951                                                    to exact spot of failure */
10952                                 vFAIL(error_msg);
10953                             }
10954                             ender = result;
10955                             if (PL_encoding && ender < 0x100) {
10956                                 goto recode_encoding;
10957                             }
10958                             if (ender > 0xff) {
10959                                 REQUIRE_UTF8;
10960                             }
10961                             break;
10962                         }
10963                     case 'x':
10964                         {
10965                             UV result = UV_MAX; /* initialize to erroneous
10966                                                    value */
10967                             const char* error_msg;
10968
10969                             bool valid = grok_bslash_x(&p,
10970                                                        &result,
10971                                                        &error_msg,
10972                                                        TRUE, /* out warnings */
10973                                                        FALSE, /* not strict */
10974                                                        TRUE, /* Output warnings
10975                                                                 for non-
10976                                                                 portables */
10977                                                        UTF);
10978                             if (! valid) {
10979                                 RExC_parse = p; /* going to die anyway; point
10980                                                    to exact spot of failure */
10981                                 vFAIL(error_msg);
10982                             }
10983                             ender = result;
10984
10985                             if (PL_encoding && ender < 0x100) {
10986                                 goto recode_encoding;
10987                             }
10988                             if (ender > 0xff) {
10989                                 REQUIRE_UTF8;
10990                             }
10991                             break;
10992                         }
10993                     case 'c':
10994                         p++;
10995                         ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10996                         break;
10997                     case '8': case '9': /* must be a backreference */
10998                         --p;
10999                         goto loopdone;
11000                     case '1': case '2': case '3':case '4':
11001                     case '5': case '6': case '7':
11002                         /* When we parse backslash escapes there is ambiguity
11003                          * between backreferences and octal escapes. Any escape
11004                          * from \1 - \9 is a backreference, any multi-digit
11005                          * escape which does not start with 0 and which when
11006                          * evaluated as decimal could refer to an already
11007                          * parsed capture buffer is a backslash. Anything else
11008                          * is octal.
11009                          *
11010                          * Note this implies that \118 could be interpreted as
11011                          * 118 OR as "\11" . "8" depending on whether there
11012                          * were 118 capture buffers defined already in the
11013                          * pattern.  */
11014                         if ( !isDIGIT(p[1]) || atoi(p) <= RExC_npar )
11015                         {  /* Not to be treated as an octal constant, go
11016                                    find backref */
11017                             --p;
11018                             goto loopdone;
11019                         }
11020                     case '0':
11021                         {
11022                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11023                             STRLEN numlen = 3;
11024                             ender = grok_oct(p, &numlen, &flags, NULL);
11025                             if (ender > 0xff) {
11026                                 REQUIRE_UTF8;
11027                             }
11028                             p += numlen;
11029                             if (SIZE_ONLY   /* like \08, \178 */
11030                                 && numlen < 3
11031                                 && p < RExC_end
11032                                 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
11033                             {
11034                                 reg_warn_non_literal_string(
11035                                          p + 1,
11036                                          form_short_octal_warning(p, numlen));
11037                             }
11038                         }
11039                         if (PL_encoding && ender < 0x100)
11040                             goto recode_encoding;
11041                         break;
11042                     recode_encoding:
11043                         if (! RExC_override_recoding) {
11044                             SV* enc = PL_encoding;
11045                             ender = reg_recode((const char)(U8)ender, &enc);
11046                             if (!enc && SIZE_ONLY)
11047                                 ckWARNreg(p, "Invalid escape in the specified encoding");
11048                             REQUIRE_UTF8;
11049                         }
11050                         break;
11051                     case '\0':
11052                         if (p >= RExC_end)
11053                             FAIL("Trailing \\");
11054                         /* FALL THROUGH */
11055                     default:
11056                         if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
11057                             /* Include any { following the alpha to emphasize
11058                              * that it could be part of an escape at some point
11059                              * in the future */
11060                             int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
11061                             ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
11062                         }
11063                         goto normal_default;
11064                     } /* End of switch on '\' */
11065                     break;
11066                 default:    /* A literal character */
11067
11068                     if (! SIZE_ONLY
11069                         && RExC_flags & RXf_PMf_EXTENDED
11070                         && ckWARN_d(WARN_DEPRECATED)
11071                         && is_PATWS_non_low(p, UTF))
11072                     {
11073                         vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
11074                                 "Escape literal pattern white space under /x");
11075                     }
11076
11077                   normal_default:
11078                     if (UTF8_IS_START(*p) && UTF) {
11079                         STRLEN numlen;
11080                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
11081                                                &numlen, UTF8_ALLOW_DEFAULT);
11082                         p += numlen;
11083                     }
11084                     else
11085                         ender = (U8) *p++;
11086                     break;
11087                 } /* End of switch on the literal */
11088
11089                 /* Here, have looked at the literal character and <ender>
11090                  * contains its ordinal, <p> points to the character after it
11091                  */
11092
11093                 if ( RExC_flags & RXf_PMf_EXTENDED)
11094                     p = regwhite( pRExC_state, p );
11095
11096                 /* If the next thing is a quantifier, it applies to this
11097                  * character only, which means that this character has to be in
11098                  * its own node and can't just be appended to the string in an
11099                  * existing node, so if there are already other characters in
11100                  * the node, close the node with just them, and set up to do
11101                  * this character again next time through, when it will be the
11102                  * only thing in its new node */
11103                 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
11104                 {
11105                     p = oldp;
11106                     goto loopdone;
11107                 }
11108
11109                 if (! FOLD) {
11110                     if (UTF) {
11111                         const STRLEN unilen = reguni(pRExC_state, ender, s);
11112                         if (unilen > 0) {
11113                            s   += unilen;
11114                            len += unilen;
11115                         }
11116
11117                         /* The loop increments <len> each time, as all but this
11118                          * path (and one other) through it add a single byte to
11119                          * the EXACTish node.  But this one has changed len to
11120                          * be the correct final value, so subtract one to
11121                          * cancel out the increment that follows */
11122                         len--;
11123                     }
11124                     else {
11125                         REGC((char)ender, s++);
11126                     }
11127                 }
11128                 else /* FOLD */ if (! ( UTF
11129                         /* See comments for join_exact() as to why we fold this
11130                          * non-UTF at compile time */
11131                         || (node_type == EXACTFU
11132                             && ender == LATIN_SMALL_LETTER_SHARP_S)))
11133                 {
11134                     if (IS_IN_SOME_FOLD_L1(ender)) {
11135                         maybe_exact = FALSE;
11136
11137                         /* See if the character's fold differs between /d and
11138                          * /u.  This includes the multi-char fold SHARP S to
11139                          * 'ss' */
11140                         if (maybe_exactfu
11141                             && (PL_fold[ender] != PL_fold_latin1[ender]
11142                                 || ender == LATIN_SMALL_LETTER_SHARP_S
11143                                 || (len > 0
11144                                    && isARG2_lower_or_UPPER_ARG1('s', ender)
11145                                    && isARG2_lower_or_UPPER_ARG1('s', *(s-1)))))
11146                         {
11147                             maybe_exactfu = FALSE;
11148                         }
11149                     }
11150                     *(s++) = (char) ender;
11151                 }
11152                 else {  /* UTF */
11153
11154                     /* Prime the casefolded buffer.  Locale rules, which apply
11155                      * only to code points < 256, aren't known until execution,
11156                      * so for them, just output the original character using
11157                      * utf8.  If we start to fold non-UTF patterns, be sure to
11158                      * update join_exact() */
11159                     if (LOC && ender < 256) {
11160                         if (NATIVE_IS_INVARIANT(ender)) {
11161                             *s = (U8) ender;
11162                             foldlen = 1;
11163                         } else {
11164                             *s = UTF8_TWO_BYTE_HI(ender);
11165                             *(s + 1) = UTF8_TWO_BYTE_LO(ender);
11166                             foldlen = 2;
11167                         }
11168                     }
11169                     else {
11170                         UV folded = _to_uni_fold_flags(
11171                                        ender,
11172                                        (U8 *) s,
11173                                        &foldlen,
11174                                        FOLD_FLAGS_FULL
11175                                        | ((LOC) ?  FOLD_FLAGS_LOCALE
11176                                                 : (ASCII_FOLD_RESTRICTED)
11177                                                   ? FOLD_FLAGS_NOMIX_ASCII
11178                                                   : 0)
11179                                         );
11180
11181                         /* If this node only contains non-folding code points
11182                          * so far, see if this new one is also non-folding */
11183                         if (maybe_exact) {
11184                             if (folded != ender) {
11185                                 maybe_exact = FALSE;
11186                             }
11187                             else {
11188                                 /* Here the fold is the original; we have
11189                                  * to check further to see if anything
11190                                  * folds to it */
11191                                 if (! PL_utf8_foldable) {
11192                                     SV* swash = swash_init("utf8",
11193                                                        "_Perl_Any_Folds",
11194                                                        &PL_sv_undef, 1, 0);
11195                                     PL_utf8_foldable =
11196                                                 _get_swash_invlist(swash);
11197                                     SvREFCNT_dec_NN(swash);
11198                                 }
11199                                 if (_invlist_contains_cp(PL_utf8_foldable,
11200                                                          ender))
11201                                 {
11202                                     maybe_exact = FALSE;
11203                                 }
11204                             }
11205                         }
11206                         ender = folded;
11207                     }
11208                     s += foldlen;
11209
11210                     /* The loop increments <len> each time, as all but this
11211                      * path (and one other) through it add a single byte to the
11212                      * EXACTish node.  But this one has changed len to be the
11213                      * correct final value, so subtract one to cancel out the
11214                      * increment that follows */
11215                     len += foldlen - 1;
11216                 }
11217
11218                 if (next_is_quantifier) {
11219
11220                     /* Here, the next input is a quantifier, and to get here,
11221                      * the current character is the only one in the node.
11222                      * Also, here <len> doesn't include the final byte for this
11223                      * character */
11224                     len++;
11225                     goto loopdone;
11226                 }
11227
11228             } /* End of loop through literal characters */
11229
11230             /* Here we have either exhausted the input or ran out of room in
11231              * the node.  (If we encountered a character that can't be in the
11232              * node, transfer is made directly to <loopdone>, and so we
11233              * wouldn't have fallen off the end of the loop.)  In the latter
11234              * case, we artificially have to split the node into two, because
11235              * we just don't have enough space to hold everything.  This
11236              * creates a problem if the final character participates in a
11237              * multi-character fold in the non-final position, as a match that
11238              * should have occurred won't, due to the way nodes are matched,
11239              * and our artificial boundary.  So back off until we find a non-
11240              * problematic character -- one that isn't at the beginning or
11241              * middle of such a fold.  (Either it doesn't participate in any
11242              * folds, or appears only in the final position of all the folds it
11243              * does participate in.)  A better solution with far fewer false
11244              * positives, and that would fill the nodes more completely, would
11245              * be to actually have available all the multi-character folds to
11246              * test against, and to back-off only far enough to be sure that
11247              * this node isn't ending with a partial one.  <upper_parse> is set
11248              * further below (if we need to reparse the node) to include just
11249              * up through that final non-problematic character that this code
11250              * identifies, so when it is set to less than the full node, we can
11251              * skip the rest of this */
11252             if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
11253
11254                 const STRLEN full_len = len;
11255
11256                 assert(len >= MAX_NODE_STRING_SIZE);
11257
11258                 /* Here, <s> points to the final byte of the final character.
11259                  * Look backwards through the string until find a non-
11260                  * problematic character */
11261
11262                 if (! UTF) {
11263
11264                     /* These two have no multi-char folds to non-UTF characters
11265                      */
11266                     if (ASCII_FOLD_RESTRICTED || LOC) {
11267                         goto loopdone;
11268                     }
11269
11270                     while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
11271                     len = s - s0 + 1;
11272                 }
11273                 else {
11274                     if (!  PL_NonL1NonFinalFold) {
11275                         PL_NonL1NonFinalFold = _new_invlist_C_array(
11276                                         NonL1_Perl_Non_Final_Folds_invlist);
11277                     }
11278
11279                     /* Point to the first byte of the final character */
11280                     s = (char *) utf8_hop((U8 *) s, -1);
11281
11282                     while (s >= s0) {   /* Search backwards until find
11283                                            non-problematic char */
11284                         if (UTF8_IS_INVARIANT(*s)) {
11285
11286                             /* There are no ascii characters that participate
11287                              * in multi-char folds under /aa.  In EBCDIC, the
11288                              * non-ascii invariants are all control characters,
11289                              * so don't ever participate in any folds. */
11290                             if (ASCII_FOLD_RESTRICTED
11291                                 || ! IS_NON_FINAL_FOLD(*s))
11292                             {
11293                                 break;
11294                             }
11295                         }
11296                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
11297
11298                             /* No Latin1 characters participate in multi-char
11299                              * folds under /l */
11300                             if (LOC
11301                                 || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
11302                                                                   *s, *(s+1))))
11303                             {
11304                                 break;
11305                             }
11306                         }
11307                         else if (! _invlist_contains_cp(
11308                                         PL_NonL1NonFinalFold,
11309                                         valid_utf8_to_uvchr((U8 *) s, NULL)))
11310                         {
11311                             break;
11312                         }
11313
11314                         /* Here, the current character is problematic in that
11315                          * it does occur in the non-final position of some
11316                          * fold, so try the character before it, but have to
11317                          * special case the very first byte in the string, so
11318                          * we don't read outside the string */
11319                         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
11320                     } /* End of loop backwards through the string */
11321
11322                     /* If there were only problematic characters in the string,
11323                      * <s> will point to before s0, in which case the length
11324                      * should be 0, otherwise include the length of the
11325                      * non-problematic character just found */
11326                     len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
11327                 }
11328
11329                 /* Here, have found the final character, if any, that is
11330                  * non-problematic as far as ending the node without splitting
11331                  * it across a potential multi-char fold.  <len> contains the
11332                  * number of bytes in the node up-to and including that
11333                  * character, or is 0 if there is no such character, meaning
11334                  * the whole node contains only problematic characters.  In
11335                  * this case, give up and just take the node as-is.  We can't
11336                  * do any better */
11337                 if (len == 0) {
11338                     len = full_len;
11339
11340                     /* If the node ends in an 's' we make sure it stays EXACTF,
11341                      * as if it turns into an EXACTFU, it could later get
11342                      * joined with another 's' that would then wrongly match
11343                      * the sharp s */
11344                     if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
11345                     {
11346                         maybe_exactfu = FALSE;
11347                     }
11348                 } else {
11349
11350                     /* Here, the node does contain some characters that aren't
11351                      * problematic.  If one such is the final character in the
11352                      * node, we are done */
11353                     if (len == full_len) {
11354                         goto loopdone;
11355                     }
11356                     else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
11357
11358                         /* If the final character is problematic, but the
11359                          * penultimate is not, back-off that last character to
11360                          * later start a new node with it */
11361                         p = oldp;
11362                         goto loopdone;
11363                     }
11364
11365                     /* Here, the final non-problematic character is earlier
11366                      * in the input than the penultimate character.  What we do
11367                      * is reparse from the beginning, going up only as far as
11368                      * this final ok one, thus guaranteeing that the node ends
11369                      * in an acceptable character.  The reason we reparse is
11370                      * that we know how far in the character is, but we don't
11371                      * know how to correlate its position with the input parse.
11372                      * An alternate implementation would be to build that
11373                      * correlation as we go along during the original parse,
11374                      * but that would entail extra work for every node, whereas
11375                      * this code gets executed only when the string is too
11376                      * large for the node, and the final two characters are
11377                      * problematic, an infrequent occurrence.  Yet another
11378                      * possible strategy would be to save the tail of the
11379                      * string, and the next time regatom is called, initialize
11380                      * with that.  The problem with this is that unless you
11381                      * back off one more character, you won't be guaranteed
11382                      * regatom will get called again, unless regbranch,
11383                      * regpiece ... are also changed.  If you do back off that
11384                      * extra character, so that there is input guaranteed to
11385                      * force calling regatom, you can't handle the case where
11386                      * just the first character in the node is acceptable.  I
11387                      * (khw) decided to try this method which doesn't have that
11388                      * pitfall; if performance issues are found, we can do a
11389                      * combination of the current approach plus that one */
11390                     upper_parse = len;
11391                     len = 0;
11392                     s = s0;
11393                     goto reparse;
11394                 }
11395             }   /* End of verifying node ends with an appropriate char */
11396
11397         loopdone:   /* Jumped to when encounters something that shouldn't be in
11398                        the node */
11399
11400             /* I (khw) don't know if you can get here with zero length, but the
11401              * old code handled this situation by creating a zero-length EXACT
11402              * node.  Might as well be NOTHING instead */
11403             if (len == 0) {
11404                 OP(ret) = NOTHING;
11405             }
11406             else {
11407                 if (FOLD) {
11408                     /* If 'maybe_exact' is still set here, means there are no
11409                      * code points in the node that participate in folds;
11410                      * similarly for 'maybe_exactfu' and code points that match
11411                      * differently depending on UTF8ness of the target string
11412                      * */
11413                     if (maybe_exact) {
11414                         OP(ret) = EXACT;
11415                     }
11416                     else if (maybe_exactfu) {
11417                         OP(ret) = EXACTFU;
11418                     }
11419                 }
11420                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
11421             }
11422
11423             RExC_parse = p - 1;
11424             Set_Node_Cur_Length(ret, parse_start);
11425             nextchar(pRExC_state);
11426             {
11427                 /* len is STRLEN which is unsigned, need to copy to signed */
11428                 IV iv = len;
11429                 if (iv < 0)
11430                     vFAIL("Internal disaster");
11431             }
11432
11433         } /* End of label 'defchar:' */
11434         break;
11435     } /* End of giant switch on input character */
11436
11437     return(ret);
11438 }
11439
11440 STATIC char *
11441 S_regwhite( RExC_state_t *pRExC_state, char *p )
11442 {
11443     const char *e = RExC_end;
11444
11445     PERL_ARGS_ASSERT_REGWHITE;
11446
11447     while (p < e) {
11448         if (isSPACE(*p))
11449             ++p;
11450         else if (*p == '#') {
11451             bool ended = 0;
11452             do {
11453                 if (*p++ == '\n') {
11454                     ended = 1;
11455                     break;
11456                 }
11457             } while (p < e);
11458             if (!ended)
11459                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11460         }
11461         else
11462             break;
11463     }
11464     return p;
11465 }
11466
11467 STATIC char *
11468 S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
11469 {
11470     /* Returns the next non-pattern-white space, non-comment character (the
11471      * latter only if 'recognize_comment is true) in the string p, which is
11472      * ended by RExC_end.  If there is no line break ending a comment,
11473      * RExC_seen has added the REG_SEEN_RUN_ON_COMMENT flag; */
11474     const char *e = RExC_end;
11475
11476     PERL_ARGS_ASSERT_REGPATWS;
11477
11478     while (p < e) {
11479         STRLEN len;
11480         if ((len = is_PATWS_safe(p, e, UTF))) {
11481             p += len;
11482         }
11483         else if (recognize_comment && *p == '#') {
11484             bool ended = 0;
11485             do {
11486                 p++;
11487                 if (is_LNBREAK_safe(p, e, UTF)) {
11488                     ended = 1;
11489                     break;
11490                 }
11491             } while (p < e);
11492             if (!ended)
11493                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11494         }
11495         else
11496             break;
11497     }
11498     return p;
11499 }
11500
11501 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
11502    Character classes ([:foo:]) can also be negated ([:^foo:]).
11503    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
11504    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
11505    but trigger failures because they are currently unimplemented. */
11506
11507 #define POSIXCC_DONE(c)   ((c) == ':')
11508 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
11509 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
11510
11511 PERL_STATIC_INLINE I32
11512 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
11513 {
11514     dVAR;
11515     I32 namedclass = OOB_NAMEDCLASS;
11516
11517     PERL_ARGS_ASSERT_REGPPOSIXCC;
11518
11519     if (value == '[' && RExC_parse + 1 < RExC_end &&
11520         /* I smell either [: or [= or [. -- POSIX has been here, right? */
11521         POSIXCC(UCHARAT(RExC_parse)))
11522     {
11523         const char c = UCHARAT(RExC_parse);
11524         char* const s = RExC_parse++;
11525
11526         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
11527             RExC_parse++;
11528         if (RExC_parse == RExC_end) {
11529             if (strict) {
11530
11531                 /* Try to give a better location for the error (than the end of
11532                  * the string) by looking for the matching ']' */
11533                 RExC_parse = s;
11534                 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11535                     RExC_parse++;
11536                 }
11537                 vFAIL2("Unmatched '%c' in POSIX class", c);
11538             }
11539             /* Grandfather lone [:, [=, [. */
11540             RExC_parse = s;
11541         }
11542         else {
11543             const char* const t = RExC_parse++; /* skip over the c */
11544             assert(*t == c);
11545
11546             if (UCHARAT(RExC_parse) == ']') {
11547                 const char *posixcc = s + 1;
11548                 RExC_parse++; /* skip over the ending ] */
11549
11550                 if (*s == ':') {
11551                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
11552                     const I32 skip = t - posixcc;
11553
11554                     /* Initially switch on the length of the name.  */
11555                     switch (skip) {
11556                     case 4:
11557                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
11558                                                           this is the Perl \w
11559                                                         */
11560                             namedclass = ANYOF_WORDCHAR;
11561                         break;
11562                     case 5:
11563                         /* Names all of length 5.  */
11564                         /* alnum alpha ascii blank cntrl digit graph lower
11565                            print punct space upper  */
11566                         /* Offset 4 gives the best switch position.  */
11567                         switch (posixcc[4]) {
11568                         case 'a':
11569                             if (memEQ(posixcc, "alph", 4)) /* alpha */
11570                                 namedclass = ANYOF_ALPHA;
11571                             break;
11572                         case 'e':
11573                             if (memEQ(posixcc, "spac", 4)) /* space */
11574                                 namedclass = ANYOF_PSXSPC;
11575                             break;
11576                         case 'h':
11577                             if (memEQ(posixcc, "grap", 4)) /* graph */
11578                                 namedclass = ANYOF_GRAPH;
11579                             break;
11580                         case 'i':
11581                             if (memEQ(posixcc, "asci", 4)) /* ascii */
11582                                 namedclass = ANYOF_ASCII;
11583                             break;
11584                         case 'k':
11585                             if (memEQ(posixcc, "blan", 4)) /* blank */
11586                                 namedclass = ANYOF_BLANK;
11587                             break;
11588                         case 'l':
11589                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
11590                                 namedclass = ANYOF_CNTRL;
11591                             break;
11592                         case 'm':
11593                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
11594                                 namedclass = ANYOF_ALPHANUMERIC;
11595                             break;
11596                         case 'r':
11597                             if (memEQ(posixcc, "lowe", 4)) /* lower */
11598                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
11599                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
11600                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
11601                             break;
11602                         case 't':
11603                             if (memEQ(posixcc, "digi", 4)) /* digit */
11604                                 namedclass = ANYOF_DIGIT;
11605                             else if (memEQ(posixcc, "prin", 4)) /* print */
11606                                 namedclass = ANYOF_PRINT;
11607                             else if (memEQ(posixcc, "punc", 4)) /* punct */
11608                                 namedclass = ANYOF_PUNCT;
11609                             break;
11610                         }
11611                         break;
11612                     case 6:
11613                         if (memEQ(posixcc, "xdigit", 6))
11614                             namedclass = ANYOF_XDIGIT;
11615                         break;
11616                     }
11617
11618                     if (namedclass == OOB_NAMEDCLASS)
11619                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
11620                                       t - s - 1, s + 1);
11621
11622                     /* The #defines are structured so each complement is +1 to
11623                      * the normal one */
11624                     if (complement) {
11625                         namedclass++;
11626                     }
11627                     assert (posixcc[skip] == ':');
11628                     assert (posixcc[skip+1] == ']');
11629                 } else if (!SIZE_ONLY) {
11630                     /* [[=foo=]] and [[.foo.]] are still future. */
11631
11632                     /* adjust RExC_parse so the warning shows after
11633                        the class closes */
11634                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
11635                         RExC_parse++;
11636                     vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11637                 }
11638             } else {
11639                 /* Maternal grandfather:
11640                  * "[:" ending in ":" but not in ":]" */
11641                 if (strict) {
11642                     vFAIL("Unmatched '[' in POSIX class");
11643                 }
11644
11645                 /* Grandfather lone [:, [=, [. */
11646                 RExC_parse = s;
11647             }
11648         }
11649     }
11650
11651     return namedclass;
11652 }
11653
11654 STATIC bool
11655 S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state)
11656 {
11657     /* This applies some heuristics at the current parse position (which should
11658      * be at a '[') to see if what follows might be intended to be a [:posix:]
11659      * class.  It returns true if it really is a posix class, of course, but it
11660      * also can return true if it thinks that what was intended was a posix
11661      * class that didn't quite make it.
11662      *
11663      * It will return true for
11664      *      [:alphanumerics:
11665      *      [:alphanumerics]  (as long as the ] isn't followed immediately by a
11666      *                         ')' indicating the end of the (?[
11667      *      [:any garbage including %^&$ punctuation:]
11668      *
11669      * This is designed to be called only from S_handle_regex_sets; it could be
11670      * easily adapted to be called from the spot at the beginning of regclass()
11671      * that checks to see in a normal bracketed class if the surrounding []
11672      * have been omitted ([:word:] instead of [[:word:]]).  But doing so would
11673      * change long-standing behavior, so I (khw) didn't do that */
11674     char* p = RExC_parse + 1;
11675     char first_char = *p;
11676
11677     PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
11678
11679     assert(*(p - 1) == '[');
11680
11681     if (! POSIXCC(first_char)) {
11682         return FALSE;
11683     }
11684
11685     p++;
11686     while (p < RExC_end && isWORDCHAR(*p)) p++;
11687
11688     if (p >= RExC_end) {
11689         return FALSE;
11690     }
11691
11692     if (p - RExC_parse > 2    /* Got at least 1 word character */
11693         && (*p == first_char
11694             || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
11695     {
11696         return TRUE;
11697     }
11698
11699     p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
11700
11701     return (p
11702             && p - RExC_parse > 2 /* [:] evaluates to colon;
11703                                       [::] is a bad posix class. */
11704             && first_char == *(p - 1));
11705 }
11706
11707 STATIC regnode *
11708 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *flagp, U32 depth,
11709                    char * const oregcomp_parse)
11710 {
11711     /* Handle the (?[...]) construct to do set operations */
11712
11713     U8 curchar;
11714     UV start, end;      /* End points of code point ranges */
11715     SV* result_string;
11716     char *save_end, *save_parse;
11717     SV* final;
11718     STRLEN len;
11719     regnode* node;
11720     AV* stack;
11721     const bool save_fold = FOLD;
11722
11723     GET_RE_DEBUG_FLAGS_DECL;
11724
11725     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
11726
11727     if (LOC) {
11728         vFAIL("(?[...]) not valid in locale");
11729     }
11730     RExC_uni_semantics = 1;
11731
11732     /* This will return only an ANYOF regnode, or (unlikely) something smaller
11733      * (such as EXACT).  Thus we can skip most everything if just sizing.  We
11734      * call regclass to handle '[]' so as to not have to reinvent its parsing
11735      * rules here (throwing away the size it computes each time).  And, we exit
11736      * upon an unescaped ']' that isn't one ending a regclass.  To do both
11737      * these things, we need to realize that something preceded by a backslash
11738      * is escaped, so we have to keep track of backslashes */
11739     if (SIZE_ONLY) {
11740         UV depth = 0; /* how many nested (?[...]) constructs */
11741
11742         Perl_ck_warner_d(aTHX_
11743             packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
11744             "The regex_sets feature is experimental" REPORT_LOCATION,
11745             (int) (RExC_parse - RExC_precomp) , RExC_precomp, RExC_parse);
11746
11747         while (RExC_parse < RExC_end) {
11748             SV* current = NULL;
11749             RExC_parse = regpatws(pRExC_state, RExC_parse,
11750                                 TRUE); /* means recognize comments */
11751             switch (*RExC_parse) {
11752                 case '?':
11753                     if (RExC_parse[1] == '[') depth++, RExC_parse++;
11754                     /* FALL THROUGH */
11755                 default:
11756                     break;
11757                 case '\\':
11758                     /* Skip the next byte (which could cause us to end up in
11759                      * the middle of a UTF-8 character, but since none of those
11760                      * are confusable with anything we currently handle in this
11761                      * switch (invariants all), it's safe.  We'll just hit the
11762                      * default: case next time and keep on incrementing until
11763                      * we find one of the invariants we do handle. */
11764                     RExC_parse++;
11765                     break;
11766                 case '[':
11767                 {
11768                     /* If this looks like it is a [:posix:] class, leave the
11769                      * parse pointer at the '[' to fool regclass() into
11770                      * thinking it is part of a '[[:posix:]]'.  That function
11771                      * will use strict checking to force a syntax error if it
11772                      * doesn't work out to a legitimate class */
11773                     bool is_posix_class
11774                                     = could_it_be_a_POSIX_class(pRExC_state);
11775                     if (! is_posix_class) {
11776                         RExC_parse++;
11777                     }
11778
11779                     /* regclass() can only return RESTART_UTF8 if multi-char
11780                        folds are allowed.  */
11781                     if (!regclass(pRExC_state, flagp,depth+1,
11782                                   is_posix_class, /* parse the whole char
11783                                                      class only if not a
11784                                                      posix class */
11785                                   FALSE, /* don't allow multi-char folds */
11786                                   TRUE, /* silence non-portable warnings. */
11787                                   &current))
11788                         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
11789                               (UV) *flagp);
11790
11791                     /* function call leaves parse pointing to the ']', except
11792                      * if we faked it */
11793                     if (is_posix_class) {
11794                         RExC_parse--;
11795                     }
11796
11797                     SvREFCNT_dec(current);   /* In case it returned something */
11798                     break;
11799                 }
11800
11801                 case ']':
11802                     if (depth--) break;
11803                     RExC_parse++;
11804                     if (RExC_parse < RExC_end
11805                         && *RExC_parse == ')')
11806                     {
11807                         node = reganode(pRExC_state, ANYOF, 0);
11808                         RExC_size += ANYOF_SKIP;
11809                         nextchar(pRExC_state);
11810                         Set_Node_Length(node,
11811                                 RExC_parse - oregcomp_parse + 1); /* MJD */
11812                         return node;
11813                     }
11814                     goto no_close;
11815             }
11816             RExC_parse++;
11817         }
11818
11819         no_close:
11820         FAIL("Syntax error in (?[...])");
11821     }
11822
11823     /* Pass 2 only after this.  Everything in this construct is a
11824      * metacharacter.  Operands begin with either a '\' (for an escape
11825      * sequence), or a '[' for a bracketed character class.  Any other
11826      * character should be an operator, or parenthesis for grouping.  Both
11827      * types of operands are handled by calling regclass() to parse them.  It
11828      * is called with a parameter to indicate to return the computed inversion
11829      * list.  The parsing here is implemented via a stack.  Each entry on the
11830      * stack is a single character representing one of the operators, or the
11831      * '('; or else a pointer to an operand inversion list. */
11832
11833 #define IS_OPERAND(a)  (! SvIOK(a))
11834
11835     /* The stack starts empty.  It is a syntax error if the first thing parsed
11836      * is a binary operator; everything else is pushed on the stack.  When an
11837      * operand is parsed, the top of the stack is examined.  If it is a binary
11838      * operator, the item before it should be an operand, and both are replaced
11839      * by the result of doing that operation on the new operand and the one on
11840      * the stack.   Thus a sequence of binary operands is reduced to a single
11841      * one before the next one is parsed.
11842      *
11843      * A unary operator may immediately follow a binary in the input, for
11844      * example
11845      *      [a] + ! [b]
11846      * When an operand is parsed and the top of the stack is a unary operator,
11847      * the operation is performed, and then the stack is rechecked to see if
11848      * this new operand is part of a binary operation; if so, it is handled as
11849      * above.
11850      *
11851      * A '(' is simply pushed on the stack; it is valid only if the stack is
11852      * empty, or the top element of the stack is an operator or another '('
11853      * (for which the parenthesized expression will become an operand).  By the
11854      * time the corresponding ')' is parsed everything in between should have
11855      * been parsed and evaluated to a single operand (or else is a syntax
11856      * error), and is handled as a regular operand */
11857
11858     sv_2mortal((SV *)(stack = newAV()));
11859
11860     while (RExC_parse < RExC_end) {
11861         I32 top_index = av_tindex(stack);
11862         SV** top_ptr;
11863         SV* current = NULL;
11864
11865         /* Skip white space */
11866         RExC_parse = regpatws(pRExC_state, RExC_parse,
11867                                 TRUE); /* means recognize comments */
11868         if (RExC_parse >= RExC_end) {
11869             Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
11870         }
11871         if ((curchar = UCHARAT(RExC_parse)) == ']') {
11872             break;
11873         }
11874
11875         switch (curchar) {
11876
11877             case '?':
11878                 if (av_tindex(stack) >= 0   /* This makes sure that we can
11879                                                safely subtract 1 from
11880                                                RExC_parse in the next clause.
11881                                                If we have something on the
11882                                                stack, we have parsed something
11883                                              */
11884                     && UCHARAT(RExC_parse - 1) == '('
11885                     && RExC_parse < RExC_end)
11886                 {
11887                     /* If is a '(?', could be an embedded '(?flags:(?[...])'.
11888                      * This happens when we have some thing like
11889                      *
11890                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
11891                      *   ...
11892                      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
11893                      *
11894                      * Here we would be handling the interpolated
11895                      * '$thai_or_lao'.  We handle this by a recursive call to
11896                      * ourselves which returns the inversion list the
11897                      * interpolated expression evaluates to.  We use the flags
11898                      * from the interpolated pattern. */
11899                     U32 save_flags = RExC_flags;
11900                     const char * const save_parse = ++RExC_parse;
11901
11902                     parse_lparen_question_flags(pRExC_state);
11903
11904                     if (RExC_parse == save_parse  /* Makes sure there was at
11905                                                      least one flag (or this
11906                                                      embedding wasn't compiled)
11907                                                    */
11908                         || RExC_parse >= RExC_end - 4
11909                         || UCHARAT(RExC_parse) != ':'
11910                         || UCHARAT(++RExC_parse) != '('
11911                         || UCHARAT(++RExC_parse) != '?'
11912                         || UCHARAT(++RExC_parse) != '[')
11913                     {
11914
11915                         /* In combination with the above, this moves the
11916                          * pointer to the point just after the first erroneous
11917                          * character (or if there are no flags, to where they
11918                          * should have been) */
11919                         if (RExC_parse >= RExC_end - 4) {
11920                             RExC_parse = RExC_end;
11921                         }
11922                         else if (RExC_parse != save_parse) {
11923                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11924                         }
11925                         vFAIL("Expecting '(?flags:(?[...'");
11926                     }
11927                     RExC_parse++;
11928                     (void) handle_regex_sets(pRExC_state, &current, flagp,
11929                                                     depth+1, oregcomp_parse);
11930
11931                     /* Here, 'current' contains the embedded expression's
11932                      * inversion list, and RExC_parse points to the trailing
11933                      * ']'; the next character should be the ')' which will be
11934                      * paired with the '(' that has been put on the stack, so
11935                      * the whole embedded expression reduces to '(operand)' */
11936                     RExC_parse++;
11937
11938                     RExC_flags = save_flags;
11939                     goto handle_operand;
11940                 }
11941                 /* FALL THROUGH */
11942
11943             default:
11944                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11945                 vFAIL("Unexpected character");
11946
11947             case '\\':
11948                 /* regclass() can only return RESTART_UTF8 if multi-char
11949                    folds are allowed.  */
11950                 if (!regclass(pRExC_state, flagp,depth+1,
11951                               TRUE, /* means parse just the next thing */
11952                               FALSE, /* don't allow multi-char folds */
11953                               FALSE, /* don't silence non-portable warnings.  */
11954                               &current))
11955                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
11956                           (UV) *flagp);
11957                 /* regclass() will return with parsing just the \ sequence,
11958                  * leaving the parse pointer at the next thing to parse */
11959                 RExC_parse--;
11960                 goto handle_operand;
11961
11962             case '[':   /* Is a bracketed character class */
11963             {
11964                 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
11965
11966                 if (! is_posix_class) {
11967                     RExC_parse++;
11968                 }
11969
11970                 /* regclass() can only return RESTART_UTF8 if multi-char
11971                    folds are allowed.  */
11972                 if(!regclass(pRExC_state, flagp,depth+1,
11973                              is_posix_class, /* parse the whole char class
11974                                                 only if not a posix class */
11975                              FALSE, /* don't allow multi-char folds */
11976                              FALSE, /* don't silence non-portable warnings.  */
11977                              &current))
11978                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
11979                           (UV) *flagp);
11980                 /* function call leaves parse pointing to the ']', except if we
11981                  * faked it */
11982                 if (is_posix_class) {
11983                     RExC_parse--;
11984                 }
11985
11986                 goto handle_operand;
11987             }
11988
11989             case '&':
11990             case '|':
11991             case '+':
11992             case '-':
11993             case '^':
11994                 if (top_index < 0
11995                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
11996                     || ! IS_OPERAND(*top_ptr))
11997                 {
11998                     RExC_parse++;
11999                     vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
12000                 }
12001                 av_push(stack, newSVuv(curchar));
12002                 break;
12003
12004             case '!':
12005                 av_push(stack, newSVuv(curchar));
12006                 break;
12007
12008             case '(':
12009                 if (top_index >= 0) {
12010                     top_ptr = av_fetch(stack, top_index, FALSE);
12011                     assert(top_ptr);
12012                     if (IS_OPERAND(*top_ptr)) {
12013                         RExC_parse++;
12014                         vFAIL("Unexpected '(' with no preceding operator");
12015                     }
12016                 }
12017                 av_push(stack, newSVuv(curchar));
12018                 break;
12019
12020             case ')':
12021             {
12022                 SV* lparen;
12023                 if (top_index < 1
12024                     || ! (current = av_pop(stack))
12025                     || ! IS_OPERAND(current)
12026                     || ! (lparen = av_pop(stack))
12027                     || IS_OPERAND(lparen)
12028                     || SvUV(lparen) != '(')
12029                 {
12030                     SvREFCNT_dec(current);
12031                     RExC_parse++;
12032                     vFAIL("Unexpected ')'");
12033                 }
12034                 top_index -= 2;
12035                 SvREFCNT_dec_NN(lparen);
12036
12037                 /* FALL THROUGH */
12038             }
12039
12040               handle_operand:
12041
12042                 /* Here, we have an operand to process, in 'current' */
12043
12044                 if (top_index < 0) {    /* Just push if stack is empty */
12045                     av_push(stack, current);
12046                 }
12047                 else {
12048                     SV* top = av_pop(stack);
12049                     SV *prev = NULL;
12050                     char current_operator;
12051
12052                     if (IS_OPERAND(top)) {
12053                         SvREFCNT_dec_NN(top);
12054                         SvREFCNT_dec_NN(current);
12055                         vFAIL("Operand with no preceding operator");
12056                     }
12057                     current_operator = (char) SvUV(top);
12058                     switch (current_operator) {
12059                         case '(':   /* Push the '(' back on followed by the new
12060                                        operand */
12061                             av_push(stack, top);
12062                             av_push(stack, current);
12063                             SvREFCNT_inc(top);  /* Counters the '_dec' done
12064                                                    just after the 'break', so
12065                                                    it doesn't get wrongly freed
12066                                                  */
12067                             break;
12068
12069                         case '!':
12070                             _invlist_invert(current);
12071
12072                             /* Unlike binary operators, the top of the stack,
12073                              * now that this unary one has been popped off, may
12074                              * legally be an operator, and we now have operand
12075                              * for it. */
12076                             top_index--;
12077                             SvREFCNT_dec_NN(top);
12078                             goto handle_operand;
12079
12080                         case '&':
12081                             prev = av_pop(stack);
12082                             _invlist_intersection(prev,
12083                                                    current,
12084                                                    &current);
12085                             av_push(stack, current);
12086                             break;
12087
12088                         case '|':
12089                         case '+':
12090                             prev = av_pop(stack);
12091                             _invlist_union(prev, current, &current);
12092                             av_push(stack, current);
12093                             break;
12094
12095                         case '-':
12096                             prev = av_pop(stack);;
12097                             _invlist_subtract(prev, current, &current);
12098                             av_push(stack, current);
12099                             break;
12100
12101                         case '^':   /* The union minus the intersection */
12102                         {
12103                             SV* i = NULL;
12104                             SV* u = NULL;
12105                             SV* element;
12106
12107                             prev = av_pop(stack);
12108                             _invlist_union(prev, current, &u);
12109                             _invlist_intersection(prev, current, &i);
12110                             /* _invlist_subtract will overwrite current
12111                                 without freeing what it already contains */
12112                             element = current;
12113                             _invlist_subtract(u, i, &current);
12114                             av_push(stack, current);
12115                             SvREFCNT_dec_NN(i);
12116                             SvREFCNT_dec_NN(u);
12117                             SvREFCNT_dec_NN(element);
12118                             break;
12119                         }
12120
12121                         default:
12122                             Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
12123                 }
12124                 SvREFCNT_dec_NN(top);
12125                 SvREFCNT_dec(prev);
12126             }
12127         }
12128
12129         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12130     }
12131
12132     if (av_tindex(stack) < 0   /* Was empty */
12133         || ((final = av_pop(stack)) == NULL)
12134         || ! IS_OPERAND(final)
12135         || av_tindex(stack) >= 0)  /* More left on stack */
12136     {
12137         vFAIL("Incomplete expression within '(?[ ])'");
12138     }
12139
12140     /* Here, 'final' is the resultant inversion list from evaluating the
12141      * expression.  Return it if so requested */
12142     if (return_invlist) {
12143         *return_invlist = final;
12144         return END;
12145     }
12146
12147     /* Otherwise generate a resultant node, based on 'final'.  regclass() is
12148      * expecting a string of ranges and individual code points */
12149     invlist_iterinit(final);
12150     result_string = newSVpvs("");
12151     while (invlist_iternext(final, &start, &end)) {
12152         if (start == end) {
12153             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
12154         }
12155         else {
12156             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
12157                                                      start,          end);
12158         }
12159     }
12160
12161     save_parse = RExC_parse;
12162     RExC_parse = SvPV(result_string, len);
12163     save_end = RExC_end;
12164     RExC_end = RExC_parse + len;
12165
12166     /* We turn off folding around the call, as the class we have constructed
12167      * already has all folding taken into consideration, and we don't want
12168      * regclass() to add to that */
12169     RExC_flags &= ~RXf_PMf_FOLD;
12170     /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
12171      */
12172     node = regclass(pRExC_state, flagp,depth+1,
12173                     FALSE, /* means parse the whole char class */
12174                     FALSE, /* don't allow multi-char folds */
12175                     TRUE, /* silence non-portable warnings.  The above may very
12176                              well have generated non-portable code points, but
12177                              they're valid on this machine */
12178                     NULL);
12179     if (!node)
12180         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
12181                     PTR2UV(flagp));
12182     if (save_fold) {
12183         RExC_flags |= RXf_PMf_FOLD;
12184     }
12185     RExC_parse = save_parse + 1;
12186     RExC_end = save_end;
12187     SvREFCNT_dec_NN(final);
12188     SvREFCNT_dec_NN(result_string);
12189
12190     nextchar(pRExC_state);
12191     Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
12192     return node;
12193 }
12194 #undef IS_OPERAND
12195
12196 /* The names of properties whose definitions are not known at compile time are
12197  * stored in this SV, after a constant heading.  So if the length has been
12198  * changed since initialization, then there is a run-time definition. */
12199 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
12200
12201 STATIC regnode *
12202 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
12203                  const bool stop_at_1,  /* Just parse the next thing, don't
12204                                            look for a full character class */
12205                  bool allow_multi_folds,
12206                  const bool silence_non_portable,   /* Don't output warnings
12207                                                        about too large
12208                                                        characters */
12209                  SV** ret_invlist)  /* Return an inversion list, not a node */
12210 {
12211     /* parse a bracketed class specification.  Most of these will produce an
12212      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
12213      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
12214      * under /i with multi-character folds: it will be rewritten following the
12215      * paradigm of this example, where the <multi-fold>s are characters which
12216      * fold to multiple character sequences:
12217      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
12218      * gets effectively rewritten as:
12219      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
12220      * reg() gets called (recursively) on the rewritten version, and this
12221      * function will return what it constructs.  (Actually the <multi-fold>s
12222      * aren't physically removed from the [abcdefghi], it's just that they are
12223      * ignored in the recursion by means of a flag:
12224      * <RExC_in_multi_char_class>.)
12225      *
12226      * ANYOF nodes contain a bit map for the first 256 characters, with the
12227      * corresponding bit set if that character is in the list.  For characters
12228      * above 255, a range list or swash is used.  There are extra bits for \w,
12229      * etc. in locale ANYOFs, as what these match is not determinable at
12230      * compile time
12231      *
12232      * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
12233      * to be restarted.  This can only happen if ret_invlist is non-NULL.
12234      */
12235
12236     dVAR;
12237     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
12238     IV range = 0;
12239     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
12240     regnode *ret;
12241     STRLEN numlen;
12242     IV namedclass = OOB_NAMEDCLASS;
12243     char *rangebegin = NULL;
12244     bool need_class = 0;
12245     SV *listsv = NULL;
12246     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
12247                                       than just initialized.  */
12248     SV* properties = NULL;    /* Code points that match \p{} \P{} */
12249     SV* posixes = NULL;     /* Code points that match classes like, [:word:],
12250                                extended beyond the Latin1 range */
12251     UV element_count = 0;   /* Number of distinct elements in the class.
12252                                Optimizations may be possible if this is tiny */
12253     AV * multi_char_matches = NULL; /* Code points that fold to more than one
12254                                        character; used under /i */
12255     UV n;
12256     char * stop_ptr = RExC_end;    /* where to stop parsing */
12257     const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
12258                                                    space? */
12259     const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
12260
12261     /* Unicode properties are stored in a swash; this holds the current one
12262      * being parsed.  If this swash is the only above-latin1 component of the
12263      * character class, an optimization is to pass it directly on to the
12264      * execution engine.  Otherwise, it is set to NULL to indicate that there
12265      * are other things in the class that have to be dealt with at execution
12266      * time */
12267     SV* swash = NULL;           /* Code points that match \p{} \P{} */
12268
12269     /* Set if a component of this character class is user-defined; just passed
12270      * on to the engine */
12271     bool has_user_defined_property = FALSE;
12272
12273     /* inversion list of code points this node matches only when the target
12274      * string is in UTF-8.  (Because is under /d) */
12275     SV* depends_list = NULL;
12276
12277     /* inversion list of code points this node matches.  For much of the
12278      * function, it includes only those that match regardless of the utf8ness
12279      * of the target string */
12280     SV* cp_list = NULL;
12281
12282 #ifdef EBCDIC
12283     /* In a range, counts how many 0-2 of the ends of it came from literals,
12284      * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
12285     UV literal_endpoint = 0;
12286 #endif
12287     bool invert = FALSE;    /* Is this class to be complemented */
12288
12289     /* Is there any thing like \W or [:^digit:] that matches above the legal
12290      * Unicode range? */
12291     bool runtime_posix_matches_above_Unicode = FALSE;
12292
12293     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
12294         case we need to change the emitted regop to an EXACT. */
12295     const char * orig_parse = RExC_parse;
12296     const SSize_t orig_size = RExC_size;
12297     GET_RE_DEBUG_FLAGS_DECL;
12298
12299     PERL_ARGS_ASSERT_REGCLASS;
12300 #ifndef DEBUGGING
12301     PERL_UNUSED_ARG(depth);
12302 #endif
12303
12304     DEBUG_PARSE("clas");
12305
12306     /* Assume we are going to generate an ANYOF node. */
12307     ret = reganode(pRExC_state, ANYOF, 0);
12308
12309     if (SIZE_ONLY) {
12310         RExC_size += ANYOF_SKIP;
12311         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
12312     }
12313     else {
12314         ANYOF_FLAGS(ret) = 0;
12315
12316         RExC_emit += ANYOF_SKIP;
12317         if (LOC) {
12318             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
12319         }
12320         listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
12321         initial_listsv_len = SvCUR(listsv);
12322         SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
12323     }
12324
12325     if (skip_white) {
12326         RExC_parse = regpatws(pRExC_state, RExC_parse,
12327                               FALSE /* means don't recognize comments */);
12328     }
12329
12330     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
12331         RExC_parse++;
12332         invert = TRUE;
12333         allow_multi_folds = FALSE;
12334         RExC_naughty++;
12335         if (skip_white) {
12336             RExC_parse = regpatws(pRExC_state, RExC_parse,
12337                                   FALSE /* means don't recognize comments */);
12338         }
12339     }
12340
12341     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
12342     if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
12343         const char *s = RExC_parse;
12344         const char  c = *s++;
12345
12346         while (isWORDCHAR(*s))
12347             s++;
12348         if (*s && c == *s && s[1] == ']') {
12349             SAVEFREESV(RExC_rx_sv);
12350             ckWARN3reg(s+2,
12351                        "POSIX syntax [%c %c] belongs inside character classes",
12352                        c, c);
12353             (void)ReREFCNT_inc(RExC_rx_sv);
12354         }
12355     }
12356
12357     /* If the caller wants us to just parse a single element, accomplish this
12358      * by faking the loop ending condition */
12359     if (stop_at_1 && RExC_end > RExC_parse) {
12360         stop_ptr = RExC_parse + 1;
12361     }
12362
12363     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
12364     if (UCHARAT(RExC_parse) == ']')
12365         goto charclassloop;
12366
12367 parseit:
12368     while (1) {
12369         if  (RExC_parse >= stop_ptr) {
12370             break;
12371         }
12372
12373         if (skip_white) {
12374             RExC_parse = regpatws(pRExC_state, RExC_parse,
12375                                   FALSE /* means don't recognize comments */);
12376         }
12377
12378         if  (UCHARAT(RExC_parse) == ']') {
12379             break;
12380         }
12381
12382     charclassloop:
12383
12384         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
12385         save_value = value;
12386         save_prevvalue = prevvalue;
12387
12388         if (!range) {
12389             rangebegin = RExC_parse;
12390             element_count++;
12391         }
12392         if (UTF) {
12393             value = utf8n_to_uvchr((U8*)RExC_parse,
12394                                    RExC_end - RExC_parse,
12395                                    &numlen, UTF8_ALLOW_DEFAULT);
12396             RExC_parse += numlen;
12397         }
12398         else
12399             value = UCHARAT(RExC_parse++);
12400
12401         if (value == '['
12402             && RExC_parse < RExC_end
12403             && POSIXCC(UCHARAT(RExC_parse)))
12404         {
12405             namedclass = regpposixcc(pRExC_state, value, strict);
12406         }
12407         else if (value == '\\') {
12408             if (UTF) {
12409                 value = utf8n_to_uvchr((U8*)RExC_parse,
12410                                    RExC_end - RExC_parse,
12411                                    &numlen, UTF8_ALLOW_DEFAULT);
12412                 RExC_parse += numlen;
12413             }
12414             else
12415                 value = UCHARAT(RExC_parse++);
12416
12417             /* Some compilers cannot handle switching on 64-bit integer
12418              * values, therefore value cannot be an UV.  Yes, this will
12419              * be a problem later if we want switch on Unicode.
12420              * A similar issue a little bit later when switching on
12421              * namedclass. --jhi */
12422
12423             /* If the \ is escaping white space when white space is being
12424              * skipped, it means that that white space is wanted literally, and
12425              * is already in 'value'.  Otherwise, need to translate the escape
12426              * into what it signifies. */
12427             if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
12428
12429             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
12430             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
12431             case 's':   namedclass = ANYOF_SPACE;       break;
12432             case 'S':   namedclass = ANYOF_NSPACE;      break;
12433             case 'd':   namedclass = ANYOF_DIGIT;       break;
12434             case 'D':   namedclass = ANYOF_NDIGIT;      break;
12435             case 'v':   namedclass = ANYOF_VERTWS;      break;
12436             case 'V':   namedclass = ANYOF_NVERTWS;     break;
12437             case 'h':   namedclass = ANYOF_HORIZWS;     break;
12438             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
12439             case 'N':  /* Handle \N{NAME} in class */
12440                 {
12441                     /* We only pay attention to the first char of
12442                     multichar strings being returned. I kinda wonder
12443                     if this makes sense as it does change the behaviour
12444                     from earlier versions, OTOH that behaviour was broken
12445                     as well. */
12446                     if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
12447                                       TRUE, /* => charclass */
12448                                       strict))
12449                     {
12450                         if (*flagp & RESTART_UTF8)
12451                             FAIL("panic: grok_bslash_N set RESTART_UTF8");
12452                         goto parseit;
12453                     }
12454                 }
12455                 break;
12456             case 'p':
12457             case 'P':
12458                 {
12459                 char *e;
12460
12461                 /* We will handle any undefined properties ourselves */
12462                 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF;
12463
12464                 if (RExC_parse >= RExC_end)
12465                     vFAIL2("Empty \\%c{}", (U8)value);
12466                 if (*RExC_parse == '{') {
12467                     const U8 c = (U8)value;
12468                     e = strchr(RExC_parse++, '}');
12469                     if (!e)
12470                         vFAIL2("Missing right brace on \\%c{}", c);
12471                     while (isSPACE(UCHARAT(RExC_parse)))
12472                         RExC_parse++;
12473                     if (e == RExC_parse)
12474                         vFAIL2("Empty \\%c{}", c);
12475                     n = e - RExC_parse;
12476                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
12477                         n--;
12478                 }
12479                 else {
12480                     e = RExC_parse;
12481                     n = 1;
12482                 }
12483                 if (!SIZE_ONLY) {
12484                     SV* invlist;
12485                     char* name;
12486
12487                     if (UCHARAT(RExC_parse) == '^') {
12488                          RExC_parse++;
12489                          n--;
12490                          /* toggle.  (The rhs xor gets the single bit that
12491                           * differs between P and p; the other xor inverts just
12492                           * that bit) */
12493                          value ^= 'P' ^ 'p';
12494
12495                          while (isSPACE(UCHARAT(RExC_parse))) {
12496                               RExC_parse++;
12497                               n--;
12498                          }
12499                     }
12500                     /* Try to get the definition of the property into
12501                      * <invlist>.  If /i is in effect, the effective property
12502                      * will have its name be <__NAME_i>.  The design is
12503                      * discussed in commit
12504                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
12505                     Newx(name, n + sizeof("_i__\n"), char);
12506
12507                     sprintf(name, "%s%.*s%s\n",
12508                                     (FOLD) ? "__" : "",
12509                                     (int)n,
12510                                     RExC_parse,
12511                                     (FOLD) ? "_i" : ""
12512                     );
12513
12514                     /* Look up the property name, and get its swash and
12515                      * inversion list, if the property is found  */
12516                     if (swash) {
12517                         SvREFCNT_dec_NN(swash);
12518                     }
12519                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
12520                                              1, /* binary */
12521                                              0, /* not tr/// */
12522                                              NULL, /* No inversion list */
12523                                              &swash_init_flags
12524                                             );
12525                     if (! swash || ! (invlist = _get_swash_invlist(swash))) {
12526                         if (swash) {
12527                             SvREFCNT_dec_NN(swash);
12528                             swash = NULL;
12529                         }
12530
12531                         /* Here didn't find it.  It could be a user-defined
12532                          * property that will be available at run-time.  If we
12533                          * accept only compile-time properties, is an error;
12534                          * otherwise add it to the list for run-time look up */
12535                         if (ret_invlist) {
12536                             RExC_parse = e + 1;
12537                             vFAIL3("Property '%.*s' is unknown", (int) n, name);
12538                         }
12539                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
12540                                         (value == 'p' ? '+' : '!'),
12541                                         name);
12542                         has_user_defined_property = TRUE;
12543
12544                         /* We don't know yet, so have to assume that the
12545                          * property could match something in the Latin1 range,
12546                          * hence something that isn't utf8.  Note that this
12547                          * would cause things in <depends_list> to match
12548                          * inappropriately, except that any \p{}, including
12549                          * this one forces Unicode semantics, which means there
12550                          * is <no depends_list> */
12551                         ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
12552                     }
12553                     else {
12554
12555                         /* Here, did get the swash and its inversion list.  If
12556                          * the swash is from a user-defined property, then this
12557                          * whole character class should be regarded as such */
12558                         has_user_defined_property =
12559                                     (swash_init_flags
12560                                      & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
12561
12562                         /* Invert if asking for the complement */
12563                         if (value == 'P') {
12564                             _invlist_union_complement_2nd(properties,
12565                                                           invlist,
12566                                                           &properties);
12567
12568                             /* The swash can't be used as-is, because we've
12569                              * inverted things; delay removing it to here after
12570                              * have copied its invlist above */
12571                             SvREFCNT_dec_NN(swash);
12572                             swash = NULL;
12573                         }
12574                         else {
12575                             _invlist_union(properties, invlist, &properties);
12576                         }
12577                     }
12578                     Safefree(name);
12579                 }
12580                 RExC_parse = e + 1;
12581                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
12582                                                 named */
12583
12584                 /* \p means they want Unicode semantics */
12585                 RExC_uni_semantics = 1;
12586                 }
12587                 break;
12588             case 'n':   value = '\n';                   break;
12589             case 'r':   value = '\r';                   break;
12590             case 't':   value = '\t';                   break;
12591             case 'f':   value = '\f';                   break;
12592             case 'b':   value = '\b';                   break;
12593             case 'e':   value = ASCII_TO_NATIVE('\033');break;
12594             case 'a':   value = '\a';                   break;
12595             case 'o':
12596                 RExC_parse--;   /* function expects to be pointed at the 'o' */
12597                 {
12598                     const char* error_msg;
12599                     bool valid = grok_bslash_o(&RExC_parse,
12600                                                &value,
12601                                                &error_msg,
12602                                                SIZE_ONLY,   /* warnings in pass
12603                                                                1 only */
12604                                                strict,
12605                                                silence_non_portable,
12606                                                UTF);
12607                     if (! valid) {
12608                         vFAIL(error_msg);
12609                     }
12610                 }
12611                 if (PL_encoding && value < 0x100) {
12612                     goto recode_encoding;
12613                 }
12614                 break;
12615             case 'x':
12616                 RExC_parse--;   /* function expects to be pointed at the 'x' */
12617                 {
12618                     const char* error_msg;
12619                     bool valid = grok_bslash_x(&RExC_parse,
12620                                                &value,
12621                                                &error_msg,
12622                                                TRUE, /* Output warnings */
12623                                                strict,
12624                                                silence_non_portable,
12625                                                UTF);
12626                     if (! valid) {
12627                         vFAIL(error_msg);
12628                     }
12629                 }
12630                 if (PL_encoding && value < 0x100)
12631                     goto recode_encoding;
12632                 break;
12633             case 'c':
12634                 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
12635                 break;
12636             case '0': case '1': case '2': case '3': case '4':
12637             case '5': case '6': case '7':
12638                 {
12639                     /* Take 1-3 octal digits */
12640                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
12641                     numlen = (strict) ? 4 : 3;
12642                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
12643                     RExC_parse += numlen;
12644                     if (numlen != 3) {
12645                         if (strict) {
12646                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12647                             vFAIL("Need exactly 3 octal digits");
12648                         }
12649                         else if (! SIZE_ONLY /* like \08, \178 */
12650                                  && numlen < 3
12651                                  && RExC_parse < RExC_end
12652                                  && isDIGIT(*RExC_parse)
12653                                  && ckWARN(WARN_REGEXP))
12654                         {
12655                             SAVEFREESV(RExC_rx_sv);
12656                             reg_warn_non_literal_string(
12657                                  RExC_parse + 1,
12658                                  form_short_octal_warning(RExC_parse, numlen));
12659                             (void)ReREFCNT_inc(RExC_rx_sv);
12660                         }
12661                     }
12662                     if (PL_encoding && value < 0x100)
12663                         goto recode_encoding;
12664                     break;
12665                 }
12666             recode_encoding:
12667                 if (! RExC_override_recoding) {
12668                     SV* enc = PL_encoding;
12669                     value = reg_recode((const char)(U8)value, &enc);
12670                     if (!enc) {
12671                         if (strict) {
12672                             vFAIL("Invalid escape in the specified encoding");
12673                         }
12674                         else if (SIZE_ONLY) {
12675                             ckWARNreg(RExC_parse,
12676                                   "Invalid escape in the specified encoding");
12677                         }
12678                     }
12679                     break;
12680                 }
12681             default:
12682                 /* Allow \_ to not give an error */
12683                 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
12684                     if (strict) {
12685                         vFAIL2("Unrecognized escape \\%c in character class",
12686                                (int)value);
12687                     }
12688                     else {
12689                         SAVEFREESV(RExC_rx_sv);
12690                         ckWARN2reg(RExC_parse,
12691                             "Unrecognized escape \\%c in character class passed through",
12692                             (int)value);
12693                         (void)ReREFCNT_inc(RExC_rx_sv);
12694                     }
12695                 }
12696                 break;
12697             }   /* End of switch on char following backslash */
12698         } /* end of handling backslash escape sequences */
12699 #ifdef EBCDIC
12700         else
12701             literal_endpoint++;
12702 #endif
12703
12704         /* Here, we have the current token in 'value' */
12705
12706         /* What matches in a locale is not known until runtime.  This includes
12707          * what the Posix classes (like \w, [:space:]) match.  Room must be
12708          * reserved (one time per class) to store such classes, either if Perl
12709          * is compiled so that locale nodes always should have this space, or
12710          * if there is such class info to be stored.  The space will contain a
12711          * bit for each named class that is to be matched against.  This isn't
12712          * needed for \p{} and pseudo-classes, as they are not affected by
12713          * locale, and hence are dealt with separately */
12714         if (LOC
12715             && ! need_class
12716             && (ANYOF_LOCALE == ANYOF_CLASS
12717                 || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
12718         {
12719             need_class = 1;
12720             if (SIZE_ONLY) {
12721                 RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
12722             }
12723             else {
12724                 RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
12725                 ANYOF_CLASS_ZERO(ret);
12726             }
12727             ANYOF_FLAGS(ret) |= ANYOF_CLASS;
12728         }
12729
12730         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
12731
12732             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
12733              * literal, as is the character that began the false range, i.e.
12734              * the 'a' in the examples */
12735             if (range) {
12736                 if (!SIZE_ONLY) {
12737                     const int w = (RExC_parse >= rangebegin)
12738                                   ? RExC_parse - rangebegin
12739                                   : 0;
12740                     if (strict) {
12741                         vFAIL4("False [] range \"%*.*s\"", w, w, rangebegin);
12742                     }
12743                     else {
12744                         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
12745                         ckWARN4reg(RExC_parse,
12746                                 "False [] range \"%*.*s\"",
12747                                 w, w, rangebegin);
12748                         (void)ReREFCNT_inc(RExC_rx_sv);
12749                         cp_list = add_cp_to_invlist(cp_list, '-');
12750                         cp_list = add_cp_to_invlist(cp_list, prevvalue);
12751                     }
12752                 }
12753
12754                 range = 0; /* this was not a true range */
12755                 element_count += 2; /* So counts for three values */
12756             }
12757
12758             if (! SIZE_ONLY) {
12759                 U8 classnum = namedclass_to_classnum(namedclass);
12760                 if (namedclass >= ANYOF_MAX) {  /* If a special class */
12761                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
12762
12763                         /* Here, should be \h, \H, \v, or \V.  Neither /d nor
12764                          * /l make a difference in what these match.  There
12765                          * would be problems if these characters had folds
12766                          * other than themselves, as cp_list is subject to
12767                          * folding. */
12768                         if (classnum != _CC_VERTSPACE) {
12769                             assert(   namedclass == ANYOF_HORIZWS
12770                                    || namedclass == ANYOF_NHORIZWS);
12771
12772                             /* It turns out that \h is just a synonym for
12773                              * XPosixBlank */
12774                             classnum = _CC_BLANK;
12775                         }
12776
12777                         _invlist_union_maybe_complement_2nd(
12778                                 cp_list,
12779                                 PL_XPosix_ptrs[classnum],
12780                                 cBOOL(namedclass % 2), /* Complement if odd
12781                                                           (NHORIZWS, NVERTWS)
12782                                                         */
12783                                 &cp_list);
12784                     }
12785                 }
12786                 else if (classnum == _CC_ASCII) {
12787 #ifdef HAS_ISASCII
12788                     if (LOC) {
12789                         ANYOF_CLASS_SET(ret, namedclass);
12790                     }
12791                     else
12792 #endif  /* Not isascii(); just use the hard-coded definition for it */
12793                         _invlist_union_maybe_complement_2nd(
12794                                 posixes,
12795                                 PL_ASCII,
12796                                 cBOOL(namedclass % 2), /* Complement if odd
12797                                                           (NASCII) */
12798                                 &posixes);
12799                 }
12800                 else {  /* Garden variety class */
12801
12802                     /* The ascii range inversion list */
12803                     SV* ascii_source = PL_Posix_ptrs[classnum];
12804
12805                     /* The full Latin1 range inversion list */
12806                     SV* l1_source = PL_L1Posix_ptrs[classnum];
12807
12808                     /* This code is structured into two major clauses.  The
12809                      * first is for classes whose complete definitions may not
12810                      * already be known.  It not, the Latin1 definition
12811                      * (guaranteed to already known) is used plus code is
12812                      * generated to load the rest at run-time (only if needed).
12813                      * If the complete definition is known, it drops down to
12814                      * the second clause, where the complete definition is
12815                      * known */
12816
12817                     if (classnum < _FIRST_NON_SWASH_CC) {
12818
12819                         /* Here, the class has a swash, which may or not
12820                          * already be loaded */
12821
12822                         /* The name of the property to use to match the full
12823                          * eXtended Unicode range swash for this character
12824                          * class */
12825                         const char *Xname = swash_property_names[classnum];
12826
12827                         /* If returning the inversion list, we can't defer
12828                          * getting this until runtime */
12829                         if (ret_invlist && !  PL_utf8_swash_ptrs[classnum]) {
12830                             PL_utf8_swash_ptrs[classnum] =
12831                                 _core_swash_init("utf8", Xname, &PL_sv_undef,
12832                                              1, /* binary */
12833                                              0, /* not tr/// */
12834                                              NULL, /* No inversion list */
12835                                              NULL  /* No flags */
12836                                             );
12837                             assert(PL_utf8_swash_ptrs[classnum]);
12838                         }
12839                         if ( !  PL_utf8_swash_ptrs[classnum]) {
12840                             if (namedclass % 2 == 0) { /* A non-complemented
12841                                                           class */
12842                                 /* If not /a matching, there are code points we
12843                                  * don't know at compile time.  Arrange for the
12844                                  * unknown matches to be loaded at run-time, if
12845                                  * needed */
12846                                 if (! AT_LEAST_ASCII_RESTRICTED) {
12847                                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::%s\n",
12848                                                                  Xname);
12849                                 }
12850                                 if (LOC) {  /* Under locale, set run-time
12851                                                lookup */
12852                                     ANYOF_CLASS_SET(ret, namedclass);
12853                                 }
12854                                 else {
12855                                     /* Add the current class's code points to
12856                                      * the running total */
12857                                     _invlist_union(posixes,
12858                                                    (AT_LEAST_ASCII_RESTRICTED)
12859                                                         ? ascii_source
12860                                                         : l1_source,
12861                                                    &posixes);
12862                                 }
12863                             }
12864                             else {  /* A complemented class */
12865                                 if (AT_LEAST_ASCII_RESTRICTED) {
12866                                     /* Under /a should match everything above
12867                                      * ASCII, plus the complement of the set's
12868                                      * ASCII matches */
12869                                     _invlist_union_complement_2nd(posixes,
12870                                                                   ascii_source,
12871                                                                   &posixes);
12872                                 }
12873                                 else {
12874                                     /* Arrange for the unknown matches to be
12875                                      * loaded at run-time, if needed */
12876                                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::%s\n",
12877                                                                  Xname);
12878                                     runtime_posix_matches_above_Unicode = TRUE;
12879                                     if (LOC) {
12880                                         ANYOF_CLASS_SET(ret, namedclass);
12881                                     }
12882                                     else {
12883
12884                                         /* We want to match everything in
12885                                          * Latin1, except those things that
12886                                          * l1_source matches */
12887                                         SV* scratch_list = NULL;
12888                                         _invlist_subtract(PL_Latin1, l1_source,
12889                                                           &scratch_list);
12890
12891                                         /* Add the list from this class to the
12892                                          * running total */
12893                                         if (! posixes) {
12894                                             posixes = scratch_list;
12895                                         }
12896                                         else {
12897                                             _invlist_union(posixes,
12898                                                            scratch_list,
12899                                                            &posixes);
12900                                             SvREFCNT_dec_NN(scratch_list);
12901                                         }
12902                                         if (DEPENDS_SEMANTICS) {
12903                                             ANYOF_FLAGS(ret)
12904                                                   |= ANYOF_NON_UTF8_LATIN1_ALL;
12905                                         }
12906                                     }
12907                                 }
12908                             }
12909                             goto namedclass_done;
12910                         }
12911
12912                         /* Here, there is a swash loaded for the class.  If no
12913                          * inversion list for it yet, get it */
12914                         if (! PL_XPosix_ptrs[classnum]) {
12915                             PL_XPosix_ptrs[classnum]
12916                              = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]);
12917                         }
12918                     }
12919
12920                     /* Here there is an inversion list already loaded for the
12921                      * entire class */
12922
12923                     if (namedclass % 2 == 0) {  /* A non-complemented class,
12924                                                    like ANYOF_PUNCT */
12925                         if (! LOC) {
12926                             /* For non-locale, just add it to any existing list
12927                              * */
12928                             _invlist_union(posixes,
12929                                            (AT_LEAST_ASCII_RESTRICTED)
12930                                                ? ascii_source
12931                                                : PL_XPosix_ptrs[classnum],
12932                                            &posixes);
12933                         }
12934                         else {  /* Locale */
12935                             SV* scratch_list = NULL;
12936
12937                             /* For above Latin1 code points, we use the full
12938                              * Unicode range */
12939                             _invlist_intersection(PL_AboveLatin1,
12940                                                   PL_XPosix_ptrs[classnum],
12941                                                   &scratch_list);
12942                             /* And set the output to it, adding instead if
12943                              * there already is an output.  Checking if
12944                              * 'posixes' is NULL first saves an extra clone.
12945                              * Its reference count will be decremented at the
12946                              * next union, etc, or if this is the only
12947                              * instance, at the end of the routine */
12948                             if (! posixes) {
12949                                 posixes = scratch_list;
12950                             }
12951                             else {
12952                                 _invlist_union(posixes, scratch_list, &posixes);
12953                                 SvREFCNT_dec_NN(scratch_list);
12954                             }
12955
12956 #ifndef HAS_ISBLANK
12957                             if (namedclass != ANYOF_BLANK) {
12958 #endif
12959                                 /* Set this class in the node for runtime
12960                                  * matching */
12961                                 ANYOF_CLASS_SET(ret, namedclass);
12962 #ifndef HAS_ISBLANK
12963                             }
12964                             else {
12965                                 /* No isblank(), use the hard-coded ASCII-range
12966                                  * blanks, adding them to the running total. */
12967
12968                                 _invlist_union(posixes, ascii_source, &posixes);
12969                             }
12970 #endif
12971                         }
12972                     }
12973                     else {  /* A complemented class, like ANYOF_NPUNCT */
12974                         if (! LOC) {
12975                             _invlist_union_complement_2nd(
12976                                                 posixes,
12977                                                 (AT_LEAST_ASCII_RESTRICTED)
12978                                                     ? ascii_source
12979                                                     : PL_XPosix_ptrs[classnum],
12980                                                 &posixes);
12981                             /* Under /d, everything in the upper half of the
12982                              * Latin1 range matches this complement */
12983                             if (DEPENDS_SEMANTICS) {
12984                                 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
12985                             }
12986                         }
12987                         else {  /* Locale */
12988                             SV* scratch_list = NULL;
12989                             _invlist_subtract(PL_AboveLatin1,
12990                                               PL_XPosix_ptrs[classnum],
12991                                               &scratch_list);
12992                             if (! posixes) {
12993                                 posixes = scratch_list;
12994                             }
12995                             else {
12996                                 _invlist_union(posixes, scratch_list, &posixes);
12997                                 SvREFCNT_dec_NN(scratch_list);
12998                             }
12999 #ifndef HAS_ISBLANK
13000                             if (namedclass != ANYOF_NBLANK) {
13001 #endif
13002                                 ANYOF_CLASS_SET(ret, namedclass);
13003 #ifndef HAS_ISBLANK
13004                             }
13005                             else {
13006                                 /* Get the list of all code points in Latin1
13007                                  * that are not ASCII blanks, and add them to
13008                                  * the running total */
13009                                 _invlist_subtract(PL_Latin1, ascii_source,
13010                                                   &scratch_list);
13011                                 _invlist_union(posixes, scratch_list, &posixes);
13012                                 SvREFCNT_dec_NN(scratch_list);
13013                             }
13014 #endif
13015                         }
13016                     }
13017                 }
13018               namedclass_done:
13019                 continue;   /* Go get next character */
13020             }
13021         } /* end of namedclass \blah */
13022
13023         /* Here, we have a single value.  If 'range' is set, it is the ending
13024          * of a range--check its validity.  Later, we will handle each
13025          * individual code point in the range.  If 'range' isn't set, this
13026          * could be the beginning of a range, so check for that by looking
13027          * ahead to see if the next real character to be processed is the range
13028          * indicator--the minus sign */
13029
13030         if (skip_white) {
13031             RExC_parse = regpatws(pRExC_state, RExC_parse,
13032                                 FALSE /* means don't recognize comments */);
13033         }
13034
13035         if (range) {
13036             if (prevvalue > value) /* b-a */ {
13037                 const int w = RExC_parse - rangebegin;
13038                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
13039                 range = 0; /* not a valid range */
13040             }
13041         }
13042         else {
13043             prevvalue = value; /* save the beginning of the potential range */
13044             if (! stop_at_1     /* Can't be a range if parsing just one thing */
13045                 && *RExC_parse == '-')
13046             {
13047                 char* next_char_ptr = RExC_parse + 1;
13048                 if (skip_white) {   /* Get the next real char after the '-' */
13049                     next_char_ptr = regpatws(pRExC_state,
13050                                              RExC_parse + 1,
13051                                              FALSE); /* means don't recognize
13052                                                         comments */
13053                 }
13054
13055                 /* If the '-' is at the end of the class (just before the ']',
13056                  * it is a literal minus; otherwise it is a range */
13057                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
13058                     RExC_parse = next_char_ptr;
13059
13060                     /* a bad range like \w-, [:word:]- ? */
13061                     if (namedclass > OOB_NAMEDCLASS) {
13062                         if (strict || ckWARN(WARN_REGEXP)) {
13063                             const int w =
13064                                 RExC_parse >= rangebegin ?
13065                                 RExC_parse - rangebegin : 0;
13066                             if (strict) {
13067                                 vFAIL4("False [] range \"%*.*s\"",
13068                                     w, w, rangebegin);
13069                             }
13070                             else {
13071                                 vWARN4(RExC_parse,
13072                                     "False [] range \"%*.*s\"",
13073                                     w, w, rangebegin);
13074                             }
13075                         }
13076                         if (!SIZE_ONLY) {
13077                             cp_list = add_cp_to_invlist(cp_list, '-');
13078                         }
13079                         element_count++;
13080                     } else
13081                         range = 1;      /* yeah, it's a range! */
13082                     continue;   /* but do it the next time */
13083                 }
13084             }
13085         }
13086
13087         /* Here, <prevvalue> is the beginning of the range, if any; or <value>
13088          * if not */
13089
13090         /* non-Latin1 code point implies unicode semantics.  Must be set in
13091          * pass1 so is there for the whole of pass 2 */
13092         if (value > 255) {
13093             RExC_uni_semantics = 1;
13094         }
13095
13096         /* Ready to process either the single value, or the completed range.
13097          * For single-valued non-inverted ranges, we consider the possibility
13098          * of multi-char folds.  (We made a conscious decision to not do this
13099          * for the other cases because it can often lead to non-intuitive
13100          * results.  For example, you have the peculiar case that:
13101          *  "s s" =~ /^[^\xDF]+$/i => Y
13102          *  "ss"  =~ /^[^\xDF]+$/i => N
13103          *
13104          * See [perl #89750] */
13105         if (FOLD && allow_multi_folds && value == prevvalue) {
13106             if (value == LATIN_SMALL_LETTER_SHARP_S
13107                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
13108                                                         value)))
13109             {
13110                 /* Here <value> is indeed a multi-char fold.  Get what it is */
13111
13112                 U8 foldbuf[UTF8_MAXBYTES_CASE];
13113                 STRLEN foldlen;
13114
13115                 UV folded = _to_uni_fold_flags(
13116                                 value,
13117                                 foldbuf,
13118                                 &foldlen,
13119                                 FOLD_FLAGS_FULL
13120                                 | ((LOC) ?  FOLD_FLAGS_LOCALE
13121                                             : (ASCII_FOLD_RESTRICTED)
13122                                               ? FOLD_FLAGS_NOMIX_ASCII
13123                                               : 0)
13124                                 );
13125
13126                 /* Here, <folded> should be the first character of the
13127                  * multi-char fold of <value>, with <foldbuf> containing the
13128                  * whole thing.  But, if this fold is not allowed (because of
13129                  * the flags), <fold> will be the same as <value>, and should
13130                  * be processed like any other character, so skip the special
13131                  * handling */
13132                 if (folded != value) {
13133
13134                     /* Skip if we are recursed, currently parsing the class
13135                      * again.  Otherwise add this character to the list of
13136                      * multi-char folds. */
13137                     if (! RExC_in_multi_char_class) {
13138                         AV** this_array_ptr;
13139                         AV* this_array;
13140                         STRLEN cp_count = utf8_length(foldbuf,
13141                                                       foldbuf + foldlen);
13142                         SV* multi_fold = sv_2mortal(newSVpvn("", 0));
13143
13144                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
13145
13146
13147                         if (! multi_char_matches) {
13148                             multi_char_matches = newAV();
13149                         }
13150
13151                         /* <multi_char_matches> is actually an array of arrays.
13152                          * There will be one or two top-level elements: [2],
13153                          * and/or [3].  The [2] element is an array, each
13154                          * element thereof is a character which folds to TWO
13155                          * characters; [3] is for folds to THREE characters.
13156                          * (Unicode guarantees a maximum of 3 characters in any
13157                          * fold.)  When we rewrite the character class below,
13158                          * we will do so such that the longest folds are
13159                          * written first, so that it prefers the longest
13160                          * matching strings first.  This is done even if it
13161                          * turns out that any quantifier is non-greedy, out of
13162                          * programmer laziness.  Tom Christiansen has agreed
13163                          * that this is ok.  This makes the test for the
13164                          * ligature 'ffi' come before the test for 'ff' */
13165                         if (av_exists(multi_char_matches, cp_count)) {
13166                             this_array_ptr = (AV**) av_fetch(multi_char_matches,
13167                                                              cp_count, FALSE);
13168                             this_array = *this_array_ptr;
13169                         }
13170                         else {
13171                             this_array = newAV();
13172                             av_store(multi_char_matches, cp_count,
13173                                      (SV*) this_array);
13174                         }
13175                         av_push(this_array, multi_fold);
13176                     }
13177
13178                     /* This element should not be processed further in this
13179                      * class */
13180                     element_count--;
13181                     value = save_value;
13182                     prevvalue = save_prevvalue;
13183                     continue;
13184                 }
13185             }
13186         }
13187
13188         /* Deal with this element of the class */
13189         if (! SIZE_ONLY) {
13190 #ifndef EBCDIC
13191             cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
13192 #else
13193             SV* this_range = _new_invlist(1);
13194             _append_range_to_invlist(this_range, prevvalue, value);
13195
13196             /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
13197              * If this range was specified using something like 'i-j', we want
13198              * to include only the 'i' and the 'j', and not anything in
13199              * between, so exclude non-ASCII, non-alphabetics from it.
13200              * However, if the range was specified with something like
13201              * [\x89-\x91] or [\x89-j], all code points within it should be
13202              * included.  literal_endpoint==2 means both ends of the range used
13203              * a literal character, not \x{foo} */
13204             if (literal_endpoint == 2
13205                 && ((prevvalue >= 'a' && value <= 'z')
13206                     || (prevvalue >= 'A' && value <= 'Z')))
13207             {
13208                 _invlist_intersection(this_range, PL_ASCII,
13209                                       &this_range);
13210                 _invlist_intersection(this_range, PL_Posix_ptrs[_CC_ALPHA],
13211                                       &this_range);
13212             }
13213             _invlist_union(cp_list, this_range, &cp_list);
13214             literal_endpoint = 0;
13215 #endif
13216         }
13217
13218         range = 0; /* this range (if it was one) is done now */
13219     } /* End of loop through all the text within the brackets */
13220
13221     /* If anything in the class expands to more than one character, we have to
13222      * deal with them by building up a substitute parse string, and recursively
13223      * calling reg() on it, instead of proceeding */
13224     if (multi_char_matches) {
13225         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
13226         I32 cp_count;
13227         STRLEN len;
13228         char *save_end = RExC_end;
13229         char *save_parse = RExC_parse;
13230         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
13231                                        a "|" */
13232         I32 reg_flags;
13233
13234         assert(! invert);
13235 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
13236            because too confusing */
13237         if (invert) {
13238             sv_catpv(substitute_parse, "(?:");
13239         }
13240 #endif
13241
13242         /* Look at the longest folds first */
13243         for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
13244
13245             if (av_exists(multi_char_matches, cp_count)) {
13246                 AV** this_array_ptr;
13247                 SV* this_sequence;
13248
13249                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
13250                                                  cp_count, FALSE);
13251                 while ((this_sequence = av_pop(*this_array_ptr)) !=
13252                                                                 &PL_sv_undef)
13253                 {
13254                     if (! first_time) {
13255                         sv_catpv(substitute_parse, "|");
13256                     }
13257                     first_time = FALSE;
13258
13259                     sv_catpv(substitute_parse, SvPVX(this_sequence));
13260                 }
13261             }
13262         }
13263
13264         /* If the character class contains anything else besides these
13265          * multi-character folds, have to include it in recursive parsing */
13266         if (element_count) {
13267             sv_catpv(substitute_parse, "|[");
13268             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
13269             sv_catpv(substitute_parse, "]");
13270         }
13271
13272         sv_catpv(substitute_parse, ")");
13273 #if 0
13274         if (invert) {
13275             /* This is a way to get the parse to skip forward a whole named
13276              * sequence instead of matching the 2nd character when it fails the
13277              * first */
13278             sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
13279         }
13280 #endif
13281
13282         RExC_parse = SvPV(substitute_parse, len);
13283         RExC_end = RExC_parse + len;
13284         RExC_in_multi_char_class = 1;
13285         RExC_emit = (regnode *)orig_emit;
13286
13287         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
13288
13289         *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
13290
13291         RExC_parse = save_parse;
13292         RExC_end = save_end;
13293         RExC_in_multi_char_class = 0;
13294         SvREFCNT_dec_NN(multi_char_matches);
13295         return ret;
13296     }
13297
13298     /* If the character class contains only a single element, it may be
13299      * optimizable into another node type which is smaller and runs faster.
13300      * Check if this is the case for this class */
13301     if (element_count == 1 && ! ret_invlist) {
13302         U8 op = END;
13303         U8 arg = 0;
13304
13305         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
13306                                               [:digit:] or \p{foo} */
13307
13308             /* All named classes are mapped into POSIXish nodes, with its FLAG
13309              * argument giving which class it is */
13310             switch ((I32)namedclass) {
13311                 case ANYOF_UNIPROP:
13312                     break;
13313
13314                 /* These don't depend on the charset modifiers.  They always
13315                  * match under /u rules */
13316                 case ANYOF_NHORIZWS:
13317                 case ANYOF_HORIZWS:
13318                     namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
13319                     /* FALLTHROUGH */
13320
13321                 case ANYOF_NVERTWS:
13322                 case ANYOF_VERTWS:
13323                     op = POSIXU;
13324                     goto join_posix;
13325
13326                 /* The actual POSIXish node for all the rest depends on the
13327                  * charset modifier.  The ones in the first set depend only on
13328                  * ASCII or, if available on this platform, locale */
13329                 case ANYOF_ASCII:
13330                 case ANYOF_NASCII:
13331 #ifdef HAS_ISASCII
13332                     op = (LOC) ? POSIXL : POSIXA;
13333 #else
13334                     op = POSIXA;
13335 #endif
13336                     goto join_posix;
13337
13338                 case ANYOF_NCASED:
13339                 case ANYOF_LOWER:
13340                 case ANYOF_NLOWER:
13341                 case ANYOF_UPPER:
13342                 case ANYOF_NUPPER:
13343                     /* under /a could be alpha */
13344                     if (FOLD) {
13345                         if (ASCII_RESTRICTED) {
13346                             namedclass = ANYOF_ALPHA + (namedclass % 2);
13347                         }
13348                         else if (! LOC) {
13349                             break;
13350                         }
13351                     }
13352                     /* FALLTHROUGH */
13353
13354                 /* The rest have more possibilities depending on the charset.
13355                  * We take advantage of the enum ordering of the charset
13356                  * modifiers to get the exact node type, */
13357                 default:
13358                     op = POSIXD + get_regex_charset(RExC_flags);
13359                     if (op > POSIXA) { /* /aa is same as /a */
13360                         op = POSIXA;
13361                     }
13362 #ifndef HAS_ISBLANK
13363                     if (op == POSIXL
13364                         && (namedclass == ANYOF_BLANK
13365                             || namedclass == ANYOF_NBLANK))
13366                     {
13367                         op = POSIXA;
13368                     }
13369 #endif
13370
13371                 join_posix:
13372                     /* The odd numbered ones are the complements of the
13373                      * next-lower even number one */
13374                     if (namedclass % 2 == 1) {
13375                         invert = ! invert;
13376                         namedclass--;
13377                     }
13378                     arg = namedclass_to_classnum(namedclass);
13379                     break;
13380             }
13381         }
13382         else if (value == prevvalue) {
13383
13384             /* Here, the class consists of just a single code point */
13385
13386             if (invert) {
13387                 if (! LOC && value == '\n') {
13388                     op = REG_ANY; /* Optimize [^\n] */
13389                     *flagp |= HASWIDTH|SIMPLE;
13390                     RExC_naughty++;
13391                 }
13392             }
13393             else if (value < 256 || UTF) {
13394
13395                 /* Optimize a single value into an EXACTish node, but not if it
13396                  * would require converting the pattern to UTF-8. */
13397                 op = compute_EXACTish(pRExC_state);
13398             }
13399         } /* Otherwise is a range */
13400         else if (! LOC) {   /* locale could vary these */
13401             if (prevvalue == '0') {
13402                 if (value == '9') {
13403                     arg = _CC_DIGIT;
13404                     op = POSIXA;
13405                 }
13406             }
13407         }
13408
13409         /* Here, we have changed <op> away from its initial value iff we found
13410          * an optimization */
13411         if (op != END) {
13412
13413             /* Throw away this ANYOF regnode, and emit the calculated one,
13414              * which should correspond to the beginning, not current, state of
13415              * the parse */
13416             const char * cur_parse = RExC_parse;
13417             RExC_parse = (char *)orig_parse;
13418             if ( SIZE_ONLY) {
13419                 if (! LOC) {
13420
13421                     /* To get locale nodes to not use the full ANYOF size would
13422                      * require moving the code above that writes the portions
13423                      * of it that aren't in other nodes to after this point.
13424                      * e.g.  ANYOF_CLASS_SET */
13425                     RExC_size = orig_size;
13426                 }
13427             }
13428             else {
13429                 RExC_emit = (regnode *)orig_emit;
13430                 if (PL_regkind[op] == POSIXD) {
13431                     if (invert) {
13432                         op += NPOSIXD - POSIXD;
13433                     }
13434                 }
13435             }
13436
13437             ret = reg_node(pRExC_state, op);
13438
13439             if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
13440                 if (! SIZE_ONLY) {
13441                     FLAGS(ret) = arg;
13442                 }
13443                 *flagp |= HASWIDTH|SIMPLE;
13444             }
13445             else if (PL_regkind[op] == EXACT) {
13446                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13447             }
13448
13449             RExC_parse = (char *) cur_parse;
13450
13451             SvREFCNT_dec(posixes);
13452             SvREFCNT_dec(cp_list);
13453             return ret;
13454         }
13455     }
13456
13457     if (SIZE_ONLY)
13458         return ret;
13459     /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
13460
13461     /* If folding, we calculate all characters that could fold to or from the
13462      * ones already on the list */
13463     if (FOLD && cp_list) {
13464         UV start, end;  /* End points of code point ranges */
13465
13466         SV* fold_intersection = NULL;
13467
13468         /* If the highest code point is within Latin1, we can use the
13469          * compiled-in Alphas list, and not have to go out to disk.  This
13470          * yields two false positives, the masculine and feminine ordinal
13471          * indicators, which are weeded out below using the
13472          * IS_IN_SOME_FOLD_L1() macro */
13473         if (invlist_highest(cp_list) < 256) {
13474             _invlist_intersection(PL_L1Posix_ptrs[_CC_ALPHA], cp_list,
13475                                                            &fold_intersection);
13476         }
13477         else {
13478
13479             /* Here, there are non-Latin1 code points, so we will have to go
13480              * fetch the list of all the characters that participate in folds
13481              */
13482             if (! PL_utf8_foldable) {
13483                 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13484                                        &PL_sv_undef, 1, 0);
13485                 PL_utf8_foldable = _get_swash_invlist(swash);
13486                 SvREFCNT_dec_NN(swash);
13487             }
13488
13489             /* This is a hash that for a particular fold gives all characters
13490              * that are involved in it */
13491             if (! PL_utf8_foldclosures) {
13492
13493                 /* If we were unable to find any folds, then we likely won't be
13494                  * able to find the closures.  So just create an empty list.
13495                  * Folding will effectively be restricted to the non-Unicode
13496                  * rules hard-coded into Perl.  (This case happens legitimately
13497                  * during compilation of Perl itself before the Unicode tables
13498                  * are generated) */
13499                 if (_invlist_len(PL_utf8_foldable) == 0) {
13500                     PL_utf8_foldclosures = newHV();
13501                 }
13502                 else {
13503                     /* If the folds haven't been read in, call a fold function
13504                      * to force that */
13505                     if (! PL_utf8_tofold) {
13506                         U8 dummy[UTF8_MAXBYTES_CASE+1];
13507
13508                         /* This string is just a short named one above \xff */
13509                         to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
13510                         assert(PL_utf8_tofold); /* Verify that worked */
13511                     }
13512                     PL_utf8_foldclosures =
13513                                     _swash_inversion_hash(PL_utf8_tofold);
13514                 }
13515             }
13516
13517             /* Only the characters in this class that participate in folds need
13518              * be checked.  Get the intersection of this class and all the
13519              * possible characters that are foldable.  This can quickly narrow
13520              * down a large class */
13521             _invlist_intersection(PL_utf8_foldable, cp_list,
13522                                   &fold_intersection);
13523         }
13524
13525         /* Now look at the foldable characters in this class individually */
13526         invlist_iterinit(fold_intersection);
13527         while (invlist_iternext(fold_intersection, &start, &end)) {
13528             UV j;
13529
13530             /* Locale folding for Latin1 characters is deferred until runtime */
13531             if (LOC && start < 256) {
13532                 start = 256;
13533             }
13534
13535             /* Look at every character in the range */
13536             for (j = start; j <= end; j++) {
13537
13538                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
13539                 STRLEN foldlen;
13540                 SV** listp;
13541
13542                 if (j < 256) {
13543
13544                     /* We have the latin1 folding rules hard-coded here so that
13545                      * an innocent-looking character class, like /[ks]/i won't
13546                      * have to go out to disk to find the possible matches.
13547                      * XXX It would be better to generate these via regen, in
13548                      * case a new version of the Unicode standard adds new
13549                      * mappings, though that is not really likely, and may be
13550                      * caught by the default: case of the switch below. */
13551
13552                     if (IS_IN_SOME_FOLD_L1(j)) {
13553
13554                         /* ASCII is always matched; non-ASCII is matched only
13555                          * under Unicode rules */
13556                         if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
13557                             cp_list =
13558                                 add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
13559                         }
13560                         else {
13561                             depends_list =
13562                              add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
13563                         }
13564                     }
13565
13566                     if (HAS_NONLATIN1_FOLD_CLOSURE(j)
13567                         && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
13568                     {
13569                         /* Certain Latin1 characters have matches outside
13570                          * Latin1.  To get here, <j> is one of those
13571                          * characters.   None of these matches is valid for
13572                          * ASCII characters under /aa, which is why the 'if'
13573                          * just above excludes those.  These matches only
13574                          * happen when the target string is utf8.  The code
13575                          * below adds the single fold closures for <j> to the
13576                          * inversion list. */
13577                         switch (j) {
13578                             case 'k':
13579                             case 'K':
13580                                 cp_list =
13581                                     add_cp_to_invlist(cp_list, KELVIN_SIGN);
13582                                 break;
13583                             case 's':
13584                             case 'S':
13585                                 cp_list = add_cp_to_invlist(cp_list,
13586                                                     LATIN_SMALL_LETTER_LONG_S);
13587                                 break;
13588                             case MICRO_SIGN:
13589                                 cp_list = add_cp_to_invlist(cp_list,
13590                                                     GREEK_CAPITAL_LETTER_MU);
13591                                 cp_list = add_cp_to_invlist(cp_list,
13592                                                     GREEK_SMALL_LETTER_MU);
13593                                 break;
13594                             case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13595                             case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13596                                 cp_list =
13597                                     add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
13598                                 break;
13599                             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13600                                 cp_list = add_cp_to_invlist(cp_list,
13601                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13602                                 break;
13603                             case LATIN_SMALL_LETTER_SHARP_S:
13604                                 cp_list = add_cp_to_invlist(cp_list,
13605                                                 LATIN_CAPITAL_LETTER_SHARP_S);
13606                                 break;
13607                             case 'F': case 'f':
13608                             case 'I': case 'i':
13609                             case 'L': case 'l':
13610                             case 'T': case 't':
13611                             case 'A': case 'a':
13612                             case 'H': case 'h':
13613                             case 'J': case 'j':
13614                             case 'N': case 'n':
13615                             case 'W': case 'w':
13616                             case 'Y': case 'y':
13617                                 /* These all are targets of multi-character
13618                                  * folds from code points that require UTF8 to
13619                                  * express, so they can't match unless the
13620                                  * target string is in UTF-8, so no action here
13621                                  * is necessary, as regexec.c properly handles
13622                                  * the general case for UTF-8 matching and
13623                                  * multi-char folds */
13624                                 break;
13625                             default:
13626                                 /* Use deprecated warning to increase the
13627                                  * chances of this being output */
13628                                 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
13629                                 break;
13630                         }
13631                     }
13632                     continue;
13633                 }
13634
13635                 /* Here is an above Latin1 character.  We don't have the rules
13636                  * hard-coded for it.  First, get its fold.  This is the simple
13637                  * fold, as the multi-character folds have been handled earlier
13638                  * and separated out */
13639                 _to_uni_fold_flags(j, foldbuf, &foldlen,
13640                                                ((LOC)
13641                                                ? FOLD_FLAGS_LOCALE
13642                                                : (ASCII_FOLD_RESTRICTED)
13643                                                   ? FOLD_FLAGS_NOMIX_ASCII
13644                                                   : 0));
13645
13646                 /* Single character fold of above Latin1.  Add everything in
13647                  * its fold closure to the list that this node should match.
13648                  * The fold closures data structure is a hash with the keys
13649                  * being the UTF-8 of every character that is folded to, like
13650                  * 'k', and the values each an array of all code points that
13651                  * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
13652                  * Multi-character folds are not included */
13653                 if ((listp = hv_fetch(PL_utf8_foldclosures,
13654                                       (char *) foldbuf, foldlen, FALSE)))
13655                 {
13656                     AV* list = (AV*) *listp;
13657                     IV k;
13658                     for (k = 0; k <= av_len(list); k++) {
13659                         SV** c_p = av_fetch(list, k, FALSE);
13660                         UV c;
13661                         if (c_p == NULL) {
13662                             Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
13663                         }
13664                         c = SvUV(*c_p);
13665
13666                         /* /aa doesn't allow folds between ASCII and non-; /l
13667                          * doesn't allow them between above and below 256 */
13668                         if ((ASCII_FOLD_RESTRICTED
13669                                   && (isASCII(c) != isASCII(j)))
13670                             || (LOC && c < 256)) {
13671                             continue;
13672                         }
13673
13674                         /* Folds involving non-ascii Latin1 characters
13675                          * under /d are added to a separate list */
13676                         if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
13677                         {
13678                             cp_list = add_cp_to_invlist(cp_list, c);
13679                         }
13680                         else {
13681                           depends_list = add_cp_to_invlist(depends_list, c);
13682                         }
13683                     }
13684                 }
13685             }
13686         }
13687         SvREFCNT_dec_NN(fold_intersection);
13688     }
13689
13690     /* And combine the result (if any) with any inversion list from posix
13691      * classes.  The lists are kept separate up to now because we don't want to
13692      * fold the classes (folding of those is automatically handled by the swash
13693      * fetching code) */
13694     if (posixes) {
13695         if (! DEPENDS_SEMANTICS) {
13696             if (cp_list) {
13697                 _invlist_union(cp_list, posixes, &cp_list);
13698                 SvREFCNT_dec_NN(posixes);
13699             }
13700             else {
13701                 cp_list = posixes;
13702             }
13703         }
13704         else {
13705             /* Under /d, we put into a separate list the Latin1 things that
13706              * match only when the target string is utf8 */
13707             SV* nonascii_but_latin1_properties = NULL;
13708             _invlist_intersection(posixes, PL_Latin1,
13709                                   &nonascii_but_latin1_properties);
13710             _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
13711                               &nonascii_but_latin1_properties);
13712             _invlist_subtract(posixes, nonascii_but_latin1_properties,
13713                               &posixes);
13714             if (cp_list) {
13715                 _invlist_union(cp_list, posixes, &cp_list);
13716                 SvREFCNT_dec_NN(posixes);
13717             }
13718             else {
13719                 cp_list = posixes;
13720             }
13721
13722             if (depends_list) {
13723                 _invlist_union(depends_list, nonascii_but_latin1_properties,
13724                                &depends_list);
13725                 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
13726             }
13727             else {
13728                 depends_list = nonascii_but_latin1_properties;
13729             }
13730         }
13731     }
13732
13733     /* And combine the result (if any) with any inversion list from properties.
13734      * The lists are kept separate up to now so that we can distinguish the two
13735      * in regards to matching above-Unicode.  A run-time warning is generated
13736      * if a Unicode property is matched against a non-Unicode code point. But,
13737      * we allow user-defined properties to match anything, without any warning,
13738      * and we also suppress the warning if there is a portion of the character
13739      * class that isn't a Unicode property, and which matches above Unicode, \W
13740      * or [\x{110000}] for example.
13741      * (Note that in this case, unlike the Posix one above, there is no
13742      * <depends_list>, because having a Unicode property forces Unicode
13743      * semantics */
13744     if (properties) {
13745         bool warn_super = ! has_user_defined_property;
13746         if (cp_list) {
13747
13748             /* If it matters to the final outcome, see if a non-property
13749              * component of the class matches above Unicode.  If so, the
13750              * warning gets suppressed.  This is true even if just a single
13751              * such code point is specified, as though not strictly correct if
13752              * another such code point is matched against, the fact that they
13753              * are using above-Unicode code points indicates they should know
13754              * the issues involved */
13755             if (warn_super) {
13756                 bool non_prop_matches_above_Unicode =
13757                             runtime_posix_matches_above_Unicode
13758                             | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
13759                 if (invert) {
13760                     non_prop_matches_above_Unicode =
13761                                             !  non_prop_matches_above_Unicode;
13762                 }
13763                 warn_super = ! non_prop_matches_above_Unicode;
13764             }
13765
13766             _invlist_union(properties, cp_list, &cp_list);
13767             SvREFCNT_dec_NN(properties);
13768         }
13769         else {
13770             cp_list = properties;
13771         }
13772
13773         if (warn_super) {
13774             OP(ret) = ANYOF_WARN_SUPER;
13775         }
13776     }
13777
13778     /* Here, we have calculated what code points should be in the character
13779      * class.
13780      *
13781      * Now we can see about various optimizations.  Fold calculation (which we
13782      * did above) needs to take place before inversion.  Otherwise /[^k]/i
13783      * would invert to include K, which under /i would match k, which it
13784      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
13785      * folded until runtime */
13786
13787     /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
13788      * at compile time.  Besides not inverting folded locale now, we can't
13789      * invert if there are things such as \w, which aren't known until runtime
13790      * */
13791     if (invert
13792         && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
13793         && ! depends_list
13794         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13795     {
13796         _invlist_invert(cp_list);
13797
13798         /* Any swash can't be used as-is, because we've inverted things */
13799         if (swash) {
13800             SvREFCNT_dec_NN(swash);
13801             swash = NULL;
13802         }
13803
13804         /* Clear the invert flag since have just done it here */
13805         invert = FALSE;
13806     }
13807
13808     if (ret_invlist) {
13809         *ret_invlist = cp_list;
13810         SvREFCNT_dec(swash);
13811
13812         /* Discard the generated node */
13813         if (SIZE_ONLY) {
13814             RExC_size = orig_size;
13815         }
13816         else {
13817             RExC_emit = orig_emit;
13818         }
13819         return orig_emit;
13820     }
13821
13822     /* If we didn't do folding, it's because some information isn't available
13823      * until runtime; set the run-time fold flag for these.  (We don't have to
13824      * worry about properties folding, as that is taken care of by the swash
13825      * fetching) */
13826     if (FOLD && LOC)
13827     {
13828        ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
13829     }
13830
13831     /* Some character classes are equivalent to other nodes.  Such nodes take
13832      * up less room and generally fewer operations to execute than ANYOF nodes.
13833      * Above, we checked for and optimized into some such equivalents for
13834      * certain common classes that are easy to test.  Getting to this point in
13835      * the code means that the class didn't get optimized there.  Since this
13836      * code is only executed in Pass 2, it is too late to save space--it has
13837      * been allocated in Pass 1, and currently isn't given back.  But turning
13838      * things into an EXACTish node can allow the optimizer to join it to any
13839      * adjacent such nodes.  And if the class is equivalent to things like /./,
13840      * expensive run-time swashes can be avoided.  Now that we have more
13841      * complete information, we can find things necessarily missed by the
13842      * earlier code.  I (khw) am not sure how much to look for here.  It would
13843      * be easy, but perhaps too slow, to check any candidates against all the
13844      * node types they could possibly match using _invlistEQ(). */
13845
13846     if (cp_list
13847         && ! invert
13848         && ! depends_list
13849         && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
13850         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13851     {
13852         UV start, end;
13853         U8 op = END;  /* The optimzation node-type */
13854         const char * cur_parse= RExC_parse;
13855
13856         invlist_iterinit(cp_list);
13857         if (! invlist_iternext(cp_list, &start, &end)) {
13858
13859             /* Here, the list is empty.  This happens, for example, when a
13860              * Unicode property is the only thing in the character class, and
13861              * it doesn't match anything.  (perluniprops.pod notes such
13862              * properties) */
13863             op = OPFAIL;
13864             *flagp |= HASWIDTH|SIMPLE;
13865         }
13866         else if (start == end) {    /* The range is a single code point */
13867             if (! invlist_iternext(cp_list, &start, &end)
13868
13869                     /* Don't do this optimization if it would require changing
13870                      * the pattern to UTF-8 */
13871                 && (start < 256 || UTF))
13872             {
13873                 /* Here, the list contains a single code point.  Can optimize
13874                  * into an EXACT node */
13875
13876                 value = start;
13877
13878                 if (! FOLD) {
13879                     op = EXACT;
13880                 }
13881                 else if (LOC) {
13882
13883                     /* A locale node under folding with one code point can be
13884                      * an EXACTFL, as its fold won't be calculated until
13885                      * runtime */
13886                     op = EXACTFL;
13887                 }
13888                 else {
13889
13890                     /* Here, we are generally folding, but there is only one
13891                      * code point to match.  If we have to, we use an EXACT
13892                      * node, but it would be better for joining with adjacent
13893                      * nodes in the optimization pass if we used the same
13894                      * EXACTFish node that any such are likely to be.  We can
13895                      * do this iff the code point doesn't participate in any
13896                      * folds.  For example, an EXACTF of a colon is the same as
13897                      * an EXACT one, since nothing folds to or from a colon. */
13898                     if (value < 256) {
13899                         if (IS_IN_SOME_FOLD_L1(value)) {
13900                             op = EXACT;
13901                         }
13902                     }
13903                     else {
13904                         if (! PL_utf8_foldable) {
13905                             SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13906                                                 &PL_sv_undef, 1, 0);
13907                             PL_utf8_foldable = _get_swash_invlist(swash);
13908                             SvREFCNT_dec_NN(swash);
13909                         }
13910                         if (_invlist_contains_cp(PL_utf8_foldable, value)) {
13911                             op = EXACT;
13912                         }
13913                     }
13914
13915                     /* If we haven't found the node type, above, it means we
13916                      * can use the prevailing one */
13917                     if (op == END) {
13918                         op = compute_EXACTish(pRExC_state);
13919                     }
13920                 }
13921             }
13922         }
13923         else if (start == 0) {
13924             if (end == UV_MAX) {
13925                 op = SANY;
13926                 *flagp |= HASWIDTH|SIMPLE;
13927                 RExC_naughty++;
13928             }
13929             else if (end == '\n' - 1
13930                     && invlist_iternext(cp_list, &start, &end)
13931                     && start == '\n' + 1 && end == UV_MAX)
13932             {
13933                 op = REG_ANY;
13934                 *flagp |= HASWIDTH|SIMPLE;
13935                 RExC_naughty++;
13936             }
13937         }
13938         invlist_iterfinish(cp_list);
13939
13940         if (op != END) {
13941             RExC_parse = (char *)orig_parse;
13942             RExC_emit = (regnode *)orig_emit;
13943
13944             ret = reg_node(pRExC_state, op);
13945
13946             RExC_parse = (char *)cur_parse;
13947
13948             if (PL_regkind[op] == EXACT) {
13949                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13950             }
13951
13952             SvREFCNT_dec_NN(cp_list);
13953             return ret;
13954         }
13955     }
13956
13957     /* Here, <cp_list> contains all the code points we can determine at
13958      * compile time that match under all conditions.  Go through it, and
13959      * for things that belong in the bitmap, put them there, and delete from
13960      * <cp_list>.  While we are at it, see if everything above 255 is in the
13961      * list, and if so, set a flag to speed up execution */
13962     ANYOF_BITMAP_ZERO(ret);
13963     if (cp_list) {
13964
13965         /* This gets set if we actually need to modify things */
13966         bool change_invlist = FALSE;
13967
13968         UV start, end;
13969
13970         /* Start looking through <cp_list> */
13971         invlist_iterinit(cp_list);
13972         while (invlist_iternext(cp_list, &start, &end)) {
13973             UV high;
13974             int i;
13975
13976             if (end == UV_MAX && start <= 256) {
13977                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
13978             }
13979
13980             /* Quit if are above what we should change */
13981             if (start > 255) {
13982                 break;
13983             }
13984
13985             change_invlist = TRUE;
13986
13987             /* Set all the bits in the range, up to the max that we are doing */
13988             high = (end < 255) ? end : 255;
13989             for (i = start; i <= (int) high; i++) {
13990                 if (! ANYOF_BITMAP_TEST(ret, i)) {
13991                     ANYOF_BITMAP_SET(ret, i);
13992                 }
13993             }
13994         }
13995         invlist_iterfinish(cp_list);
13996
13997         /* Done with loop; remove any code points that are in the bitmap from
13998          * <cp_list> */
13999         if (change_invlist) {
14000             _invlist_subtract(cp_list, PL_Latin1, &cp_list);
14001         }
14002
14003         /* If have completely emptied it, remove it completely */
14004         if (_invlist_len(cp_list) == 0) {
14005             SvREFCNT_dec_NN(cp_list);
14006             cp_list = NULL;
14007         }
14008     }
14009
14010     if (invert) {
14011         ANYOF_FLAGS(ret) |= ANYOF_INVERT;
14012     }
14013
14014     /* Here, the bitmap has been populated with all the Latin1 code points that
14015      * always match.  Can now add to the overall list those that match only
14016      * when the target string is UTF-8 (<depends_list>). */
14017     if (depends_list) {
14018         if (cp_list) {
14019             _invlist_union(cp_list, depends_list, &cp_list);
14020             SvREFCNT_dec_NN(depends_list);
14021         }
14022         else {
14023             cp_list = depends_list;
14024         }
14025     }
14026
14027     /* If there is a swash and more than one element, we can't use the swash in
14028      * the optimization below. */
14029     if (swash && element_count > 1) {
14030         SvREFCNT_dec_NN(swash);
14031         swash = NULL;
14032     }
14033
14034     if (! cp_list
14035         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14036     {
14037         ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
14038     }
14039     else {
14040         /* av[0] stores the character class description in its textual form:
14041          *       used later (regexec.c:Perl_regclass_swash()) to initialize the
14042          *       appropriate swash, and is also useful for dumping the regnode.
14043          * av[1] if NULL, is a placeholder to later contain the swash computed
14044          *       from av[0].  But if no further computation need be done, the
14045          *       swash is stored there now.
14046          * av[2] stores the cp_list inversion list for use in addition or
14047          *       instead of av[0]; used only if av[1] is NULL
14048          * av[3] is set if any component of the class is from a user-defined
14049          *       property; used only if av[1] is NULL */
14050         AV * const av = newAV();
14051         SV *rv;
14052
14053         av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14054                         ? SvREFCNT_inc(listsv) : &PL_sv_undef);
14055         if (swash) {
14056             av_store(av, 1, swash);
14057             SvREFCNT_dec_NN(cp_list);
14058         }
14059         else {
14060             av_store(av, 1, NULL);
14061             if (cp_list) {
14062                 av_store(av, 2, cp_list);
14063                 av_store(av, 3, newSVuv(has_user_defined_property));
14064             }
14065         }
14066
14067         rv = newRV_noinc(MUTABLE_SV(av));
14068         n = add_data(pRExC_state, 1, "s");
14069         RExC_rxi->data->data[n] = (void*)rv;
14070         ARG_SET(ret, n);
14071     }
14072
14073     *flagp |= HASWIDTH|SIMPLE;
14074     return ret;
14075 }
14076 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14077
14078
14079 /* reg_skipcomment()
14080
14081    Absorbs an /x style # comments from the input stream.
14082    Returns true if there is more text remaining in the stream.
14083    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
14084    terminates the pattern without including a newline.
14085
14086    Note its the callers responsibility to ensure that we are
14087    actually in /x mode
14088
14089 */
14090
14091 STATIC bool
14092 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
14093 {
14094     bool ended = 0;
14095
14096     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
14097
14098     while (RExC_parse < RExC_end)
14099         if (*RExC_parse++ == '\n') {
14100             ended = 1;
14101             break;
14102         }
14103     if (!ended) {
14104         /* we ran off the end of the pattern without ending
14105            the comment, so we have to add an \n when wrapping */
14106         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
14107         return 0;
14108     } else
14109         return 1;
14110 }
14111
14112 /* nextchar()
14113
14114    Advances the parse position, and optionally absorbs
14115    "whitespace" from the inputstream.
14116
14117    Without /x "whitespace" means (?#...) style comments only,
14118    with /x this means (?#...) and # comments and whitespace proper.
14119
14120    Returns the RExC_parse point from BEFORE the scan occurs.
14121
14122    This is the /x friendly way of saying RExC_parse++.
14123 */
14124
14125 STATIC char*
14126 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
14127 {
14128     char* const retval = RExC_parse++;
14129
14130     PERL_ARGS_ASSERT_NEXTCHAR;
14131
14132     for (;;) {
14133         if (RExC_end - RExC_parse >= 3
14134             && *RExC_parse == '('
14135             && RExC_parse[1] == '?'
14136             && RExC_parse[2] == '#')
14137         {
14138             while (*RExC_parse != ')') {
14139                 if (RExC_parse == RExC_end)
14140                     FAIL("Sequence (?#... not terminated");
14141                 RExC_parse++;
14142             }
14143             RExC_parse++;
14144             continue;
14145         }
14146         if (RExC_flags & RXf_PMf_EXTENDED) {
14147             if (isSPACE(*RExC_parse)) {
14148                 RExC_parse++;
14149                 continue;
14150             }
14151             else if (*RExC_parse == '#') {
14152                 if ( reg_skipcomment( pRExC_state ) )
14153                     continue;
14154             }
14155         }
14156         return retval;
14157     }
14158 }
14159
14160 /*
14161 - reg_node - emit a node
14162 */
14163 STATIC regnode *                        /* Location. */
14164 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
14165 {
14166     dVAR;
14167     regnode *ptr;
14168     regnode * const ret = RExC_emit;
14169     GET_RE_DEBUG_FLAGS_DECL;
14170
14171     PERL_ARGS_ASSERT_REG_NODE;
14172
14173     if (SIZE_ONLY) {
14174         SIZE_ALIGN(RExC_size);
14175         RExC_size += 1;
14176         return(ret);
14177     }
14178     if (RExC_emit >= RExC_emit_bound)
14179         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
14180                    op, RExC_emit, RExC_emit_bound);
14181
14182     NODE_ALIGN_FILL(ret);
14183     ptr = ret;
14184     FILL_ADVANCE_NODE(ptr, op);
14185 #ifdef RE_TRACK_PATTERN_OFFSETS
14186     if (RExC_offsets) {         /* MJD */
14187         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
14188               "reg_node", __LINE__,
14189               PL_reg_name[op],
14190               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
14191                 ? "Overwriting end of array!\n" : "OK",
14192               (UV)(RExC_emit - RExC_emit_start),
14193               (UV)(RExC_parse - RExC_start),
14194               (UV)RExC_offsets[0]));
14195         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
14196     }
14197 #endif
14198     RExC_emit = ptr;
14199     return(ret);
14200 }
14201
14202 /*
14203 - reganode - emit a node with an argument
14204 */
14205 STATIC regnode *                        /* Location. */
14206 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
14207 {
14208     dVAR;
14209     regnode *ptr;
14210     regnode * const ret = RExC_emit;
14211     GET_RE_DEBUG_FLAGS_DECL;
14212
14213     PERL_ARGS_ASSERT_REGANODE;
14214
14215     if (SIZE_ONLY) {
14216         SIZE_ALIGN(RExC_size);
14217         RExC_size += 2;
14218         /*
14219            We can't do this:
14220
14221            assert(2==regarglen[op]+1);
14222
14223            Anything larger than this has to allocate the extra amount.
14224            If we changed this to be:
14225
14226            RExC_size += (1 + regarglen[op]);
14227
14228            then it wouldn't matter. Its not clear what side effect
14229            might come from that so its not done so far.
14230            -- dmq
14231         */
14232         return(ret);
14233     }
14234     if (RExC_emit >= RExC_emit_bound)
14235         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
14236                    op, RExC_emit, RExC_emit_bound);
14237
14238     NODE_ALIGN_FILL(ret);
14239     ptr = ret;
14240     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
14241 #ifdef RE_TRACK_PATTERN_OFFSETS
14242     if (RExC_offsets) {         /* MJD */
14243         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14244               "reganode",
14245               __LINE__,
14246               PL_reg_name[op],
14247               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
14248               "Overwriting end of array!\n" : "OK",
14249               (UV)(RExC_emit - RExC_emit_start),
14250               (UV)(RExC_parse - RExC_start),
14251               (UV)RExC_offsets[0]));
14252         Set_Cur_Node_Offset;
14253     }
14254 #endif
14255     RExC_emit = ptr;
14256     return(ret);
14257 }
14258
14259 /*
14260 - reguni - emit (if appropriate) a Unicode character
14261 */
14262 STATIC STRLEN
14263 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
14264 {
14265     dVAR;
14266
14267     PERL_ARGS_ASSERT_REGUNI;
14268
14269     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
14270 }
14271
14272 /*
14273 - reginsert - insert an operator in front of already-emitted operand
14274 *
14275 * Means relocating the operand.
14276 */
14277 STATIC void
14278 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
14279 {
14280     dVAR;
14281     regnode *src;
14282     regnode *dst;
14283     regnode *place;
14284     const int offset = regarglen[(U8)op];
14285     const int size = NODE_STEP_REGNODE + offset;
14286     GET_RE_DEBUG_FLAGS_DECL;
14287
14288     PERL_ARGS_ASSERT_REGINSERT;
14289     PERL_UNUSED_ARG(depth);
14290 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
14291     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
14292     if (SIZE_ONLY) {
14293         RExC_size += size;
14294         return;
14295     }
14296
14297     src = RExC_emit;
14298     RExC_emit += size;
14299     dst = RExC_emit;
14300     if (RExC_open_parens) {
14301         int paren;
14302         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
14303         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
14304             if ( RExC_open_parens[paren] >= opnd ) {
14305                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
14306                 RExC_open_parens[paren] += size;
14307             } else {
14308                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
14309             }
14310             if ( RExC_close_parens[paren] >= opnd ) {
14311                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
14312                 RExC_close_parens[paren] += size;
14313             } else {
14314                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
14315             }
14316         }
14317     }
14318
14319     while (src > opnd) {
14320         StructCopy(--src, --dst, regnode);
14321 #ifdef RE_TRACK_PATTERN_OFFSETS
14322         if (RExC_offsets) {     /* MJD 20010112 */
14323             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
14324                   "reg_insert",
14325                   __LINE__,
14326                   PL_reg_name[op],
14327                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
14328                     ? "Overwriting end of array!\n" : "OK",
14329                   (UV)(src - RExC_emit_start),
14330                   (UV)(dst - RExC_emit_start),
14331                   (UV)RExC_offsets[0]));
14332             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
14333             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
14334         }
14335 #endif
14336     }
14337
14338
14339     place = opnd;               /* Op node, where operand used to be. */
14340 #ifdef RE_TRACK_PATTERN_OFFSETS
14341     if (RExC_offsets) {         /* MJD */
14342         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14343               "reginsert",
14344               __LINE__,
14345               PL_reg_name[op],
14346               (UV)(place - RExC_emit_start) > RExC_offsets[0]
14347               ? "Overwriting end of array!\n" : "OK",
14348               (UV)(place - RExC_emit_start),
14349               (UV)(RExC_parse - RExC_start),
14350               (UV)RExC_offsets[0]));
14351         Set_Node_Offset(place, RExC_parse);
14352         Set_Node_Length(place, 1);
14353     }
14354 #endif
14355     src = NEXTOPER(place);
14356     FILL_ADVANCE_NODE(place, op);
14357     Zero(src, offset, regnode);
14358 }
14359
14360 /*
14361 - regtail - set the next-pointer at the end of a node chain of p to val.
14362 - SEE ALSO: regtail_study
14363 */
14364 /* TODO: All three parms should be const */
14365 STATIC void
14366 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14367 {
14368     dVAR;
14369     regnode *scan;
14370     GET_RE_DEBUG_FLAGS_DECL;
14371
14372     PERL_ARGS_ASSERT_REGTAIL;
14373 #ifndef DEBUGGING
14374     PERL_UNUSED_ARG(depth);
14375 #endif
14376
14377     if (SIZE_ONLY)
14378         return;
14379
14380     /* Find last node. */
14381     scan = p;
14382     for (;;) {
14383         regnode * const temp = regnext(scan);
14384         DEBUG_PARSE_r({
14385             SV * const mysv=sv_newmortal();
14386             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
14387             regprop(RExC_rx, mysv, scan);
14388             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
14389                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
14390                     (temp == NULL ? "->" : ""),
14391                     (temp == NULL ? PL_reg_name[OP(val)] : "")
14392             );
14393         });
14394         if (temp == NULL)
14395             break;
14396         scan = temp;
14397     }
14398
14399     if (reg_off_by_arg[OP(scan)]) {
14400         ARG_SET(scan, val - scan);
14401     }
14402     else {
14403         NEXT_OFF(scan) = val - scan;
14404     }
14405 }
14406
14407 #ifdef DEBUGGING
14408 /*
14409 - regtail_study - set the next-pointer at the end of a node chain of p to val.
14410 - Look for optimizable sequences at the same time.
14411 - currently only looks for EXACT chains.
14412
14413 This is experimental code. The idea is to use this routine to perform
14414 in place optimizations on branches and groups as they are constructed,
14415 with the long term intention of removing optimization from study_chunk so
14416 that it is purely analytical.
14417
14418 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
14419 to control which is which.
14420
14421 */
14422 /* TODO: All four parms should be const */
14423
14424 STATIC U8
14425 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14426 {
14427     dVAR;
14428     regnode *scan;
14429     U8 exact = PSEUDO;
14430 #ifdef EXPERIMENTAL_INPLACESCAN
14431     I32 min = 0;
14432 #endif
14433     GET_RE_DEBUG_FLAGS_DECL;
14434
14435     PERL_ARGS_ASSERT_REGTAIL_STUDY;
14436
14437
14438     if (SIZE_ONLY)
14439         return exact;
14440
14441     /* Find last node. */
14442
14443     scan = p;
14444     for (;;) {
14445         regnode * const temp = regnext(scan);
14446 #ifdef EXPERIMENTAL_INPLACESCAN
14447         if (PL_regkind[OP(scan)] == EXACT) {
14448             bool has_exactf_sharp_s;    /* Unexamined in this routine */
14449             if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
14450                 return EXACT;
14451         }
14452 #endif
14453         if ( exact ) {
14454             switch (OP(scan)) {
14455                 case EXACT:
14456                 case EXACTF:
14457                 case EXACTFA_NO_TRIE:
14458                 case EXACTFA:
14459                 case EXACTFU:
14460                 case EXACTFU_SS:
14461                 case EXACTFL:
14462                         if( exact == PSEUDO )
14463                             exact= OP(scan);
14464                         else if ( exact != OP(scan) )
14465                             exact= 0;
14466                 case NOTHING:
14467                     break;
14468                 default:
14469                     exact= 0;
14470             }
14471         }
14472         DEBUG_PARSE_r({
14473             SV * const mysv=sv_newmortal();
14474             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
14475             regprop(RExC_rx, mysv, scan);
14476             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
14477                 SvPV_nolen_const(mysv),
14478                 REG_NODE_NUM(scan),
14479                 PL_reg_name[exact]);
14480         });
14481         if (temp == NULL)
14482             break;
14483         scan = temp;
14484     }
14485     DEBUG_PARSE_r({
14486         SV * const mysv_val=sv_newmortal();
14487         DEBUG_PARSE_MSG("");
14488         regprop(RExC_rx, mysv_val, val);
14489         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
14490                       SvPV_nolen_const(mysv_val),
14491                       (IV)REG_NODE_NUM(val),
14492                       (IV)(val - scan)
14493         );
14494     });
14495     if (reg_off_by_arg[OP(scan)]) {
14496         ARG_SET(scan, val - scan);
14497     }
14498     else {
14499         NEXT_OFF(scan) = val - scan;
14500     }
14501
14502     return exact;
14503 }
14504 #endif
14505
14506 /*
14507  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
14508  */
14509 #ifdef DEBUGGING
14510
14511 static void
14512 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
14513 {
14514     int bit;
14515     int set=0;
14516
14517     for (bit=0; bit<32; bit++) {
14518         if (flags & (1<<bit)) {
14519             if (!set++ && lead)
14520                 PerlIO_printf(Perl_debug_log, "%s",lead);
14521             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
14522         }
14523     }
14524     if (lead)  {
14525         if (set)
14526             PerlIO_printf(Perl_debug_log, "\n");
14527         else
14528             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
14529     }
14530 }
14531
14532 static void
14533 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
14534 {
14535     int bit;
14536     int set=0;
14537     regex_charset cs;
14538
14539     for (bit=0; bit<32; bit++) {
14540         if (flags & (1<<bit)) {
14541             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
14542                 continue;
14543             }
14544             if (!set++ && lead)
14545                 PerlIO_printf(Perl_debug_log, "%s",lead);
14546             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
14547         }
14548     }
14549     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
14550             if (!set++ && lead) {
14551                 PerlIO_printf(Perl_debug_log, "%s",lead);
14552             }
14553             switch (cs) {
14554                 case REGEX_UNICODE_CHARSET:
14555                     PerlIO_printf(Perl_debug_log, "UNICODE");
14556                     break;
14557                 case REGEX_LOCALE_CHARSET:
14558                     PerlIO_printf(Perl_debug_log, "LOCALE");
14559                     break;
14560                 case REGEX_ASCII_RESTRICTED_CHARSET:
14561                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
14562                     break;
14563                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
14564                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
14565                     break;
14566                 default:
14567                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
14568                     break;
14569             }
14570     }
14571     if (lead)  {
14572         if (set)
14573             PerlIO_printf(Perl_debug_log, "\n");
14574         else
14575             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
14576     }
14577 }
14578 #endif
14579
14580 void
14581 Perl_regdump(pTHX_ const regexp *r)
14582 {
14583 #ifdef DEBUGGING
14584     dVAR;
14585     SV * const sv = sv_newmortal();
14586     SV *dsv= sv_newmortal();
14587     RXi_GET_DECL(r,ri);
14588     GET_RE_DEBUG_FLAGS_DECL;
14589
14590     PERL_ARGS_ASSERT_REGDUMP;
14591
14592     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
14593
14594     /* Header fields of interest. */
14595     if (r->anchored_substr) {
14596         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
14597             RE_SV_DUMPLEN(r->anchored_substr), 30);
14598         PerlIO_printf(Perl_debug_log,
14599                       "anchored %s%s at %"IVdf" ",
14600                       s, RE_SV_TAIL(r->anchored_substr),
14601                       (IV)r->anchored_offset);
14602     } else if (r->anchored_utf8) {
14603         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
14604             RE_SV_DUMPLEN(r->anchored_utf8), 30);
14605         PerlIO_printf(Perl_debug_log,
14606                       "anchored utf8 %s%s at %"IVdf" ",
14607                       s, RE_SV_TAIL(r->anchored_utf8),
14608                       (IV)r->anchored_offset);
14609     }
14610     if (r->float_substr) {
14611         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
14612             RE_SV_DUMPLEN(r->float_substr), 30);
14613         PerlIO_printf(Perl_debug_log,
14614                       "floating %s%s at %"IVdf"..%"UVuf" ",
14615                       s, RE_SV_TAIL(r->float_substr),
14616                       (IV)r->float_min_offset, (UV)r->float_max_offset);
14617     } else if (r->float_utf8) {
14618         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
14619             RE_SV_DUMPLEN(r->float_utf8), 30);
14620         PerlIO_printf(Perl_debug_log,
14621                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
14622                       s, RE_SV_TAIL(r->float_utf8),
14623                       (IV)r->float_min_offset, (UV)r->float_max_offset);
14624     }
14625     if (r->check_substr || r->check_utf8)
14626         PerlIO_printf(Perl_debug_log,
14627                       (const char *)
14628                       (r->check_substr == r->float_substr
14629                        && r->check_utf8 == r->float_utf8
14630                        ? "(checking floating" : "(checking anchored"));
14631     if (r->extflags & RXf_NOSCAN)
14632         PerlIO_printf(Perl_debug_log, " noscan");
14633     if (r->extflags & RXf_CHECK_ALL)
14634         PerlIO_printf(Perl_debug_log, " isall");
14635     if (r->check_substr || r->check_utf8)
14636         PerlIO_printf(Perl_debug_log, ") ");
14637
14638     if (ri->regstclass) {
14639         regprop(r, sv, ri->regstclass);
14640         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
14641     }
14642     if (r->extflags & RXf_ANCH) {
14643         PerlIO_printf(Perl_debug_log, "anchored");
14644         if (r->extflags & RXf_ANCH_BOL)
14645             PerlIO_printf(Perl_debug_log, "(BOL)");
14646         if (r->extflags & RXf_ANCH_MBOL)
14647             PerlIO_printf(Perl_debug_log, "(MBOL)");
14648         if (r->extflags & RXf_ANCH_SBOL)
14649             PerlIO_printf(Perl_debug_log, "(SBOL)");
14650         if (r->extflags & RXf_ANCH_GPOS)
14651             PerlIO_printf(Perl_debug_log, "(GPOS)");
14652         PerlIO_putc(Perl_debug_log, ' ');
14653     }
14654     if (r->extflags & RXf_GPOS_SEEN)
14655         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
14656     if (r->intflags & PREGf_SKIP)
14657         PerlIO_printf(Perl_debug_log, "plus ");
14658     if (r->intflags & PREGf_IMPLICIT)
14659         PerlIO_printf(Perl_debug_log, "implicit ");
14660     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
14661     if (r->extflags & RXf_EVAL_SEEN)
14662         PerlIO_printf(Perl_debug_log, "with eval ");
14663     PerlIO_printf(Perl_debug_log, "\n");
14664     DEBUG_FLAGS_r({
14665         regdump_extflags("r->extflags: ",r->extflags);
14666         regdump_intflags("r->intflags: ",r->intflags);
14667     });
14668 #else
14669     PERL_ARGS_ASSERT_REGDUMP;
14670     PERL_UNUSED_CONTEXT;
14671     PERL_UNUSED_ARG(r);
14672 #endif  /* DEBUGGING */
14673 }
14674
14675 /*
14676 - regprop - printable representation of opcode
14677 */
14678 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
14679 STMT_START { \
14680         if (do_sep) {                           \
14681             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
14682             if (flags & ANYOF_INVERT)           \
14683                 /*make sure the invert info is in each */ \
14684                 sv_catpvs(sv, "^");             \
14685             do_sep = 0;                         \
14686         }                                       \
14687 } STMT_END
14688
14689 void
14690 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
14691 {
14692 #ifdef DEBUGGING
14693     dVAR;
14694     int k;
14695
14696     /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
14697     static const char * const anyofs[] = {
14698 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
14699     || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
14700     || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
14701     || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
14702     || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
14703     || _CC_VERTSPACE != 16
14704   #error Need to adjust order of anyofs[]
14705 #endif
14706         "[\\w]",
14707         "[\\W]",
14708         "[\\d]",
14709         "[\\D]",
14710         "[:alpha:]",
14711         "[:^alpha:]",
14712         "[:lower:]",
14713         "[:^lower:]",
14714         "[:upper:]",
14715         "[:^upper:]",
14716         "[:punct:]",
14717         "[:^punct:]",
14718         "[:print:]",
14719         "[:^print:]",
14720         "[:alnum:]",
14721         "[:^alnum:]",
14722         "[:graph:]",
14723         "[:^graph:]",
14724         "[:cased:]",
14725         "[:^cased:]",
14726         "[\\s]",
14727         "[\\S]",
14728         "[:blank:]",
14729         "[:^blank:]",
14730         "[:xdigit:]",
14731         "[:^xdigit:]",
14732         "[:space:]",
14733         "[:^space:]",
14734         "[:cntrl:]",
14735         "[:^cntrl:]",
14736         "[:ascii:]",
14737         "[:^ascii:]",
14738         "[\\v]",
14739         "[\\V]"
14740     };
14741     RXi_GET_DECL(prog,progi);
14742     GET_RE_DEBUG_FLAGS_DECL;
14743
14744     PERL_ARGS_ASSERT_REGPROP;
14745
14746     sv_setpvs(sv, "");
14747
14748     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
14749         /* It would be nice to FAIL() here, but this may be called from
14750            regexec.c, and it would be hard to supply pRExC_state. */
14751         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
14752     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
14753
14754     k = PL_regkind[OP(o)];
14755
14756     if (k == EXACT) {
14757         sv_catpvs(sv, " ");
14758         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
14759          * is a crude hack but it may be the best for now since
14760          * we have no flag "this EXACTish node was UTF-8"
14761          * --jhi */
14762         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
14763                   PERL_PV_ESCAPE_UNI_DETECT |
14764                   PERL_PV_ESCAPE_NONASCII   |
14765                   PERL_PV_PRETTY_ELLIPSES   |
14766                   PERL_PV_PRETTY_LTGT       |
14767                   PERL_PV_PRETTY_NOCLEAR
14768                   );
14769     } else if (k == TRIE) {
14770         /* print the details of the trie in dumpuntil instead, as
14771          * progi->data isn't available here */
14772         const char op = OP(o);
14773         const U32 n = ARG(o);
14774         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
14775                (reg_ac_data *)progi->data->data[n] :
14776                NULL;
14777         const reg_trie_data * const trie
14778             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
14779
14780         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
14781         DEBUG_TRIE_COMPILE_r(
14782             Perl_sv_catpvf(aTHX_ sv,
14783                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
14784                 (UV)trie->startstate,
14785                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
14786                 (UV)trie->wordcount,
14787                 (UV)trie->minlen,
14788                 (UV)trie->maxlen,
14789                 (UV)TRIE_CHARCOUNT(trie),
14790                 (UV)trie->uniquecharcount
14791             )
14792         );
14793         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
14794             sv_catpvs(sv, "[");
14795             (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
14796                                                    ? ANYOF_BITMAP(o)
14797                                                    : TRIE_BITMAP(trie));
14798             sv_catpvs(sv, "]");
14799         }
14800
14801     } else if (k == CURLY) {
14802         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
14803             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
14804         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
14805     }
14806     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
14807         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
14808     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
14809         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
14810         if ( RXp_PAREN_NAMES(prog) ) {
14811             if ( k != REF || (OP(o) < NREF)) {
14812                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
14813                 SV **name= av_fetch(list, ARG(o), 0 );
14814                 if (name)
14815                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
14816             }
14817             else {
14818                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
14819                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
14820                 I32 *nums=(I32*)SvPVX(sv_dat);
14821                 SV **name= av_fetch(list, nums[0], 0 );
14822                 I32 n;
14823                 if (name) {
14824                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
14825                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
14826                                     (n ? "," : ""), (IV)nums[n]);
14827                     }
14828                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
14829                 }
14830             }
14831         }
14832     } else if (k == GOSUB)
14833         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
14834     else if (k == VERB) {
14835         if (!o->flags)
14836             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
14837                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
14838     } else if (k == LOGICAL)
14839         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
14840     else if (k == ANYOF) {
14841         const U8 flags = ANYOF_FLAGS(o);
14842         int do_sep = 0;
14843
14844
14845         if (flags & ANYOF_LOCALE)
14846             sv_catpvs(sv, "{loc}");
14847         if (flags & ANYOF_LOC_FOLD)
14848             sv_catpvs(sv, "{i}");
14849         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
14850         if (flags & ANYOF_INVERT)
14851             sv_catpvs(sv, "^");
14852
14853         /* output what the standard cp 0-255 bitmap matches */
14854         do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
14855
14856         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14857         /* output any special charclass tests (used entirely under use locale) */
14858         if (ANYOF_CLASS_TEST_ANY_SET(o)) {
14859             int i;
14860             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++) {
14861                 if (ANYOF_CLASS_TEST(o,i)) {
14862                     sv_catpv(sv, anyofs[i]);
14863                     do_sep = 1;
14864                 }
14865             }
14866         }
14867
14868         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14869
14870         if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
14871             sv_catpvs(sv, "{non-utf8-latin1-all}");
14872         }
14873
14874         /* output information about the unicode matching */
14875         if (flags & ANYOF_UNICODE_ALL)
14876             sv_catpvs(sv, "{unicode_all}");
14877         else if (ANYOF_NONBITMAP(o)) {
14878             SV *lv; /* Set if there is something outside the bit map. */
14879             bool byte_output = FALSE;   /* If something in the bitmap has been
14880                                            output */
14881
14882             if (flags & ANYOF_NONBITMAP_NON_UTF8) {
14883                 sv_catpvs(sv, "{outside bitmap}");
14884             }
14885             else {
14886                 sv_catpvs(sv, "{utf8}");
14887             }
14888
14889             /* Get the stuff that wasn't in the bitmap */
14890             (void) regclass_swash(prog, o, FALSE, &lv, NULL);
14891             if (lv && lv != &PL_sv_undef) {
14892                 char *s = savesvpv(lv);
14893                 char * const origs = s;
14894
14895                 while (*s && *s != '\n')
14896                     s++;
14897
14898                 if (*s == '\n') {
14899                     const char * const t = ++s;
14900
14901                     if (byte_output) {
14902                         sv_catpvs(sv, " ");
14903                     }
14904
14905                     while (*s) {
14906                         if (*s == '\n') {
14907
14908                             /* Truncate very long output */
14909                             if (s - origs > 256) {
14910                                 Perl_sv_catpvf(aTHX_ sv,
14911                                                "%.*s...",
14912                                                (int) (s - origs - 1),
14913                                                t);
14914                                 goto out_dump;
14915                             }
14916                             *s = ' ';
14917                         }
14918                         else if (*s == '\t') {
14919                             *s = '-';
14920                         }
14921                         s++;
14922                     }
14923                     if (s[-1] == ' ')
14924                         s[-1] = 0;
14925
14926                     sv_catpv(sv, t);
14927                 }
14928
14929             out_dump:
14930
14931                 Safefree(origs);
14932                 SvREFCNT_dec_NN(lv);
14933             }
14934         }
14935
14936         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
14937     }
14938     else if (k == POSIXD || k == NPOSIXD) {
14939         U8 index = FLAGS(o) * 2;
14940         if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
14941             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
14942         }
14943         else {
14944             sv_catpv(sv, anyofs[index]);
14945         }
14946     }
14947     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
14948         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
14949 #else
14950     PERL_UNUSED_CONTEXT;
14951     PERL_UNUSED_ARG(sv);
14952     PERL_UNUSED_ARG(o);
14953     PERL_UNUSED_ARG(prog);
14954 #endif  /* DEBUGGING */
14955 }
14956
14957 SV *
14958 Perl_re_intuit_string(pTHX_ REGEXP * const r)
14959 {                               /* Assume that RE_INTUIT is set */
14960     dVAR;
14961     struct regexp *const prog = ReANY(r);
14962     GET_RE_DEBUG_FLAGS_DECL;
14963
14964     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
14965     PERL_UNUSED_CONTEXT;
14966
14967     DEBUG_COMPILE_r(
14968         {
14969             const char * const s = SvPV_nolen_const(prog->check_substr
14970                       ? prog->check_substr : prog->check_utf8);
14971
14972             if (!PL_colorset) reginitcolors();
14973             PerlIO_printf(Perl_debug_log,
14974                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
14975                       PL_colors[4],
14976                       prog->check_substr ? "" : "utf8 ",
14977                       PL_colors[5],PL_colors[0],
14978                       s,
14979                       PL_colors[1],
14980                       (strlen(s) > 60 ? "..." : ""));
14981         } );
14982
14983     return prog->check_substr ? prog->check_substr : prog->check_utf8;
14984 }
14985
14986 /*
14987    pregfree()
14988
14989    handles refcounting and freeing the perl core regexp structure. When
14990    it is necessary to actually free the structure the first thing it
14991    does is call the 'free' method of the regexp_engine associated to
14992    the regexp, allowing the handling of the void *pprivate; member
14993    first. (This routine is not overridable by extensions, which is why
14994    the extensions free is called first.)
14995
14996    See regdupe and regdupe_internal if you change anything here.
14997 */
14998 #ifndef PERL_IN_XSUB_RE
14999 void
15000 Perl_pregfree(pTHX_ REGEXP *r)
15001 {
15002     SvREFCNT_dec(r);
15003 }
15004
15005 void
15006 Perl_pregfree2(pTHX_ REGEXP *rx)
15007 {
15008     dVAR;
15009     struct regexp *const r = ReANY(rx);
15010     GET_RE_DEBUG_FLAGS_DECL;
15011
15012     PERL_ARGS_ASSERT_PREGFREE2;
15013
15014     if (r->mother_re) {
15015         ReREFCNT_dec(r->mother_re);
15016     } else {
15017         CALLREGFREE_PVT(rx); /* free the private data */
15018         SvREFCNT_dec(RXp_PAREN_NAMES(r));
15019         Safefree(r->xpv_len_u.xpvlenu_pv);
15020     }
15021     if (r->substrs) {
15022         SvREFCNT_dec(r->anchored_substr);
15023         SvREFCNT_dec(r->anchored_utf8);
15024         SvREFCNT_dec(r->float_substr);
15025         SvREFCNT_dec(r->float_utf8);
15026         Safefree(r->substrs);
15027     }
15028     RX_MATCH_COPY_FREE(rx);
15029 #ifdef PERL_ANY_COW
15030     SvREFCNT_dec(r->saved_copy);
15031 #endif
15032     Safefree(r->offs);
15033     SvREFCNT_dec(r->qr_anoncv);
15034     rx->sv_u.svu_rx = 0;
15035 }
15036
15037 /*  reg_temp_copy()
15038
15039     This is a hacky workaround to the structural issue of match results
15040     being stored in the regexp structure which is in turn stored in
15041     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
15042     could be PL_curpm in multiple contexts, and could require multiple
15043     result sets being associated with the pattern simultaneously, such
15044     as when doing a recursive match with (??{$qr})
15045
15046     The solution is to make a lightweight copy of the regexp structure
15047     when a qr// is returned from the code executed by (??{$qr}) this
15048     lightweight copy doesn't actually own any of its data except for
15049     the starp/end and the actual regexp structure itself.
15050
15051 */
15052
15053
15054 REGEXP *
15055 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
15056 {
15057     struct regexp *ret;
15058     struct regexp *const r = ReANY(rx);
15059     const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
15060
15061     PERL_ARGS_ASSERT_REG_TEMP_COPY;
15062
15063     if (!ret_x)
15064         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
15065     else {
15066         SvOK_off((SV *)ret_x);
15067         if (islv) {
15068             /* For PVLVs, SvANY points to the xpvlv body while sv_u points
15069                to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
15070                made both spots point to the same regexp body.) */
15071             REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
15072             assert(!SvPVX(ret_x));
15073             ret_x->sv_u.svu_rx = temp->sv_any;
15074             temp->sv_any = NULL;
15075             SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
15076             SvREFCNT_dec_NN(temp);
15077             /* SvCUR still resides in the xpvlv struct, so the regexp copy-
15078                ing below will not set it. */
15079             SvCUR_set(ret_x, SvCUR(rx));
15080         }
15081     }
15082     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
15083        sv_force_normal(sv) is called.  */
15084     SvFAKE_on(ret_x);
15085     ret = ReANY(ret_x);
15086
15087     SvFLAGS(ret_x) |= SvUTF8(rx);
15088     /* We share the same string buffer as the original regexp, on which we
15089        hold a reference count, incremented when mother_re is set below.
15090        The string pointer is copied here, being part of the regexp struct.
15091      */
15092     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
15093            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
15094     if (r->offs) {
15095         const I32 npar = r->nparens+1;
15096         Newx(ret->offs, npar, regexp_paren_pair);
15097         Copy(r->offs, ret->offs, npar, regexp_paren_pair);
15098     }
15099     if (r->substrs) {
15100         Newx(ret->substrs, 1, struct reg_substr_data);
15101         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15102
15103         SvREFCNT_inc_void(ret->anchored_substr);
15104         SvREFCNT_inc_void(ret->anchored_utf8);
15105         SvREFCNT_inc_void(ret->float_substr);
15106         SvREFCNT_inc_void(ret->float_utf8);
15107
15108         /* check_substr and check_utf8, if non-NULL, point to either their
15109            anchored or float namesakes, and don't hold a second reference.  */
15110     }
15111     RX_MATCH_COPIED_off(ret_x);
15112 #ifdef PERL_ANY_COW
15113     ret->saved_copy = NULL;
15114 #endif
15115     ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
15116     SvREFCNT_inc_void(ret->qr_anoncv);
15117
15118     return ret_x;
15119 }
15120 #endif
15121
15122 /* regfree_internal()
15123
15124    Free the private data in a regexp. This is overloadable by
15125    extensions. Perl takes care of the regexp structure in pregfree(),
15126    this covers the *pprivate pointer which technically perl doesn't
15127    know about, however of course we have to handle the
15128    regexp_internal structure when no extension is in use.
15129
15130    Note this is called before freeing anything in the regexp
15131    structure.
15132  */
15133
15134 void
15135 Perl_regfree_internal(pTHX_ REGEXP * const rx)
15136 {
15137     dVAR;
15138     struct regexp *const r = ReANY(rx);
15139     RXi_GET_DECL(r,ri);
15140     GET_RE_DEBUG_FLAGS_DECL;
15141
15142     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
15143
15144     DEBUG_COMPILE_r({
15145         if (!PL_colorset)
15146             reginitcolors();
15147         {
15148             SV *dsv= sv_newmortal();
15149             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
15150                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
15151             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
15152                 PL_colors[4],PL_colors[5],s);
15153         }
15154     });
15155 #ifdef RE_TRACK_PATTERN_OFFSETS
15156     if (ri->u.offsets)
15157         Safefree(ri->u.offsets);             /* 20010421 MJD */
15158 #endif
15159     if (ri->code_blocks) {
15160         int n;
15161         for (n = 0; n < ri->num_code_blocks; n++)
15162             SvREFCNT_dec(ri->code_blocks[n].src_regex);
15163         Safefree(ri->code_blocks);
15164     }
15165
15166     if (ri->data) {
15167         int n = ri->data->count;
15168
15169         while (--n >= 0) {
15170           /* If you add a ->what type here, update the comment in regcomp.h */
15171             switch (ri->data->what[n]) {
15172             case 'a':
15173             case 'r':
15174             case 's':
15175             case 'S':
15176             case 'u':
15177                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
15178                 break;
15179             case 'f':
15180                 Safefree(ri->data->data[n]);
15181                 break;
15182             case 'l':
15183             case 'L':
15184                 break;
15185             case 'T':
15186                 { /* Aho Corasick add-on structure for a trie node.
15187                      Used in stclass optimization only */
15188                     U32 refcount;
15189                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
15190                     OP_REFCNT_LOCK;
15191                     refcount = --aho->refcount;
15192                     OP_REFCNT_UNLOCK;
15193                     if ( !refcount ) {
15194                         PerlMemShared_free(aho->states);
15195                         PerlMemShared_free(aho->fail);
15196                          /* do this last!!!! */
15197                         PerlMemShared_free(ri->data->data[n]);
15198                         PerlMemShared_free(ri->regstclass);
15199                     }
15200                 }
15201                 break;
15202             case 't':
15203                 {
15204                     /* trie structure. */
15205                     U32 refcount;
15206                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
15207                     OP_REFCNT_LOCK;
15208                     refcount = --trie->refcount;
15209                     OP_REFCNT_UNLOCK;
15210                     if ( !refcount ) {
15211                         PerlMemShared_free(trie->charmap);
15212                         PerlMemShared_free(trie->states);
15213                         PerlMemShared_free(trie->trans);
15214                         if (trie->bitmap)
15215                             PerlMemShared_free(trie->bitmap);
15216                         if (trie->jump)
15217                             PerlMemShared_free(trie->jump);
15218                         PerlMemShared_free(trie->wordinfo);
15219                         /* do this last!!!! */
15220                         PerlMemShared_free(ri->data->data[n]);
15221                     }
15222                 }
15223                 break;
15224             default:
15225                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
15226             }
15227         }
15228         Safefree(ri->data->what);
15229         Safefree(ri->data);
15230     }
15231
15232     Safefree(ri);
15233 }
15234
15235 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
15236 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
15237 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
15238
15239 /*
15240    re_dup - duplicate a regexp.
15241
15242    This routine is expected to clone a given regexp structure. It is only
15243    compiled under USE_ITHREADS.
15244
15245    After all of the core data stored in struct regexp is duplicated
15246    the regexp_engine.dupe method is used to copy any private data
15247    stored in the *pprivate pointer. This allows extensions to handle
15248    any duplication it needs to do.
15249
15250    See pregfree() and regfree_internal() if you change anything here.
15251 */
15252 #if defined(USE_ITHREADS)
15253 #ifndef PERL_IN_XSUB_RE
15254 void
15255 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
15256 {
15257     dVAR;
15258     I32 npar;
15259     const struct regexp *r = ReANY(sstr);
15260     struct regexp *ret = ReANY(dstr);
15261
15262     PERL_ARGS_ASSERT_RE_DUP_GUTS;
15263
15264     npar = r->nparens+1;
15265     Newx(ret->offs, npar, regexp_paren_pair);
15266     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
15267
15268     if (ret->substrs) {
15269         /* Do it this way to avoid reading from *r after the StructCopy().
15270            That way, if any of the sv_dup_inc()s dislodge *r from the L1
15271            cache, it doesn't matter.  */
15272         const bool anchored = r->check_substr
15273             ? r->check_substr == r->anchored_substr
15274             : r->check_utf8 == r->anchored_utf8;
15275         Newx(ret->substrs, 1, struct reg_substr_data);
15276         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15277
15278         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
15279         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
15280         ret->float_substr = sv_dup_inc(ret->float_substr, param);
15281         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
15282
15283         /* check_substr and check_utf8, if non-NULL, point to either their
15284            anchored or float namesakes, and don't hold a second reference.  */
15285
15286         if (ret->check_substr) {
15287             if (anchored) {
15288                 assert(r->check_utf8 == r->anchored_utf8);
15289                 ret->check_substr = ret->anchored_substr;
15290                 ret->check_utf8 = ret->anchored_utf8;
15291             } else {
15292                 assert(r->check_substr == r->float_substr);
15293                 assert(r->check_utf8 == r->float_utf8);
15294                 ret->check_substr = ret->float_substr;
15295                 ret->check_utf8 = ret->float_utf8;
15296             }
15297         } else if (ret->check_utf8) {
15298             if (anchored) {
15299                 ret->check_utf8 = ret->anchored_utf8;
15300             } else {
15301                 ret->check_utf8 = ret->float_utf8;
15302             }
15303         }
15304     }
15305
15306     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
15307     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
15308
15309     if (ret->pprivate)
15310         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
15311
15312     if (RX_MATCH_COPIED(dstr))
15313         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
15314     else
15315         ret->subbeg = NULL;
15316 #ifdef PERL_ANY_COW
15317     ret->saved_copy = NULL;
15318 #endif
15319
15320     /* Whether mother_re be set or no, we need to copy the string.  We
15321        cannot refrain from copying it when the storage points directly to
15322        our mother regexp, because that's
15323                1: a buffer in a different thread
15324                2: something we no longer hold a reference on
15325                so we need to copy it locally.  */
15326     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
15327     ret->mother_re   = NULL;
15328 }
15329 #endif /* PERL_IN_XSUB_RE */
15330
15331 /*
15332    regdupe_internal()
15333
15334    This is the internal complement to regdupe() which is used to copy
15335    the structure pointed to by the *pprivate pointer in the regexp.
15336    This is the core version of the extension overridable cloning hook.
15337    The regexp structure being duplicated will be copied by perl prior
15338    to this and will be provided as the regexp *r argument, however
15339    with the /old/ structures pprivate pointer value. Thus this routine
15340    may override any copying normally done by perl.
15341
15342    It returns a pointer to the new regexp_internal structure.
15343 */
15344
15345 void *
15346 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
15347 {
15348     dVAR;
15349     struct regexp *const r = ReANY(rx);
15350     regexp_internal *reti;
15351     int len;
15352     RXi_GET_DECL(r,ri);
15353
15354     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
15355
15356     len = ProgLen(ri);
15357
15358     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
15359     Copy(ri->program, reti->program, len+1, regnode);
15360
15361     reti->num_code_blocks = ri->num_code_blocks;
15362     if (ri->code_blocks) {
15363         int n;
15364         Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
15365                 struct reg_code_block);
15366         Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
15367                 struct reg_code_block);
15368         for (n = 0; n < ri->num_code_blocks; n++)
15369              reti->code_blocks[n].src_regex = (REGEXP*)
15370                     sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
15371     }
15372     else
15373         reti->code_blocks = NULL;
15374
15375     reti->regstclass = NULL;
15376
15377     if (ri->data) {
15378         struct reg_data *d;
15379         const int count = ri->data->count;
15380         int i;
15381
15382         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
15383                 char, struct reg_data);
15384         Newx(d->what, count, U8);
15385
15386         d->count = count;
15387         for (i = 0; i < count; i++) {
15388             d->what[i] = ri->data->what[i];
15389             switch (d->what[i]) {
15390                 /* see also regcomp.h and regfree_internal() */
15391             case 'a': /* actually an AV, but the dup function is identical.  */
15392             case 'r':
15393             case 's':
15394             case 'S':
15395             case 'u': /* actually an HV, but the dup function is identical.  */
15396                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
15397                 break;
15398             case 'f':
15399                 /* This is cheating. */
15400                 Newx(d->data[i], 1, struct regnode_charclass_class);
15401                 StructCopy(ri->data->data[i], d->data[i],
15402                             struct regnode_charclass_class);
15403                 reti->regstclass = (regnode*)d->data[i];
15404                 break;
15405             case 'T':
15406                 /* Trie stclasses are readonly and can thus be shared
15407                  * without duplication. We free the stclass in pregfree
15408                  * when the corresponding reg_ac_data struct is freed.
15409                  */
15410                 reti->regstclass= ri->regstclass;
15411                 /* Fall through */
15412             case 't':
15413                 OP_REFCNT_LOCK;
15414                 ((reg_trie_data*)ri->data->data[i])->refcount++;
15415                 OP_REFCNT_UNLOCK;
15416                 /* Fall through */
15417             case 'l':
15418             case 'L':
15419                 d->data[i] = ri->data->data[i];
15420                 break;
15421             default:
15422                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
15423             }
15424         }
15425
15426         reti->data = d;
15427     }
15428     else
15429         reti->data = NULL;
15430
15431     reti->name_list_idx = ri->name_list_idx;
15432
15433 #ifdef RE_TRACK_PATTERN_OFFSETS
15434     if (ri->u.offsets) {
15435         Newx(reti->u.offsets, 2*len+1, U32);
15436         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
15437     }
15438 #else
15439     SetProgLen(reti,len);
15440 #endif
15441
15442     return (void*)reti;
15443 }
15444
15445 #endif    /* USE_ITHREADS */
15446
15447 #ifndef PERL_IN_XSUB_RE
15448
15449 /*
15450  - regnext - dig the "next" pointer out of a node
15451  */
15452 regnode *
15453 Perl_regnext(pTHX_ regnode *p)
15454 {
15455     dVAR;
15456     I32 offset;
15457
15458     if (!p)
15459         return(NULL);
15460
15461     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
15462         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
15463     }
15464
15465     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
15466     if (offset == 0)
15467         return(NULL);
15468
15469     return(p+offset);
15470 }
15471 #endif
15472
15473 STATIC void
15474 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
15475 {
15476     va_list args;
15477     STRLEN l1 = strlen(pat1);
15478     STRLEN l2 = strlen(pat2);
15479     char buf[512];
15480     SV *msv;
15481     const char *message;
15482
15483     PERL_ARGS_ASSERT_RE_CROAK2;
15484
15485     if (l1 > 510)
15486         l1 = 510;
15487     if (l1 + l2 > 510)
15488         l2 = 510 - l1;
15489     Copy(pat1, buf, l1 , char);
15490     Copy(pat2, buf + l1, l2 , char);
15491     buf[l1 + l2] = '\n';
15492     buf[l1 + l2 + 1] = '\0';
15493 #ifdef I_STDARG
15494     /* ANSI variant takes additional second argument */
15495     va_start(args, pat2);
15496 #else
15497     va_start(args);
15498 #endif
15499     msv = vmess(buf, &args);
15500     va_end(args);
15501     message = SvPV_const(msv,l1);
15502     if (l1 > 512)
15503         l1 = 512;
15504     Copy(message, buf, l1 , char);
15505     buf[l1-1] = '\0';                   /* Overwrite \n */
15506     Perl_croak(aTHX_ "%s", buf);
15507 }
15508
15509 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
15510
15511 #ifndef PERL_IN_XSUB_RE
15512 void
15513 Perl_save_re_context(pTHX)
15514 {
15515     dVAR;
15516
15517     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
15518     if (PL_curpm) {
15519         const REGEXP * const rx = PM_GETRE(PL_curpm);
15520         if (rx) {
15521             U32 i;
15522             for (i = 1; i <= RX_NPARENS(rx); i++) {
15523                 char digits[TYPE_CHARS(long)];
15524                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
15525                 GV *const *const gvp
15526                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
15527
15528                 if (gvp) {
15529                     GV * const gv = *gvp;
15530                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
15531                         save_scalar(gv);
15532                 }
15533             }
15534         }
15535     }
15536 }
15537 #endif
15538
15539 #ifdef DEBUGGING
15540
15541 STATIC void
15542 S_put_byte(pTHX_ SV *sv, int c)
15543 {
15544     PERL_ARGS_ASSERT_PUT_BYTE;
15545
15546     /* Our definition of isPRINT() ignores locales, so only bytes that are
15547        not part of UTF-8 are considered printable. I assume that the same
15548        holds for UTF-EBCDIC.
15549        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
15550        which Wikipedia says:
15551
15552        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
15553        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
15554        identical, to the ASCII delete (DEL) or rubout control character. ...
15555        it is typically mapped to hexadecimal code 9F, in order to provide a
15556        unique character mapping in both directions)
15557
15558        So the old condition can be simplified to !isPRINT(c)  */
15559     if (!isPRINT(c)) {
15560         switch (c) {
15561             case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
15562             case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
15563             case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
15564             case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
15565             case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
15566
15567             default:
15568                 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
15569                 break;
15570         }
15571     }
15572     else {
15573         const char string = c;
15574         if (c == '-' || c == ']' || c == '\\' || c == '^')
15575             sv_catpvs(sv, "\\");
15576         sv_catpvn(sv, &string, 1);
15577     }
15578 }
15579
15580 STATIC bool
15581 S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
15582 {
15583     /* Appends to 'sv' a displayable version of the innards of the bracketed
15584      * character class whose bitmap is 'bitmap';  Returns 'TRUE' if it actually
15585      * output anything */
15586
15587     int i;
15588     int rangestart = -1;
15589     bool has_output_anything = FALSE;
15590
15591     PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
15592
15593     for (i = 0; i <= 256; i++) {
15594         if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) {
15595             if (rangestart == -1)
15596                 rangestart = i;
15597         } else if (rangestart != -1) {
15598             int j = i - 1;
15599             if (i <= rangestart + 3) {  /* Individual chars in short ranges */
15600                 for (; rangestart < i; rangestart++)
15601                     put_byte(sv, rangestart);
15602             }
15603             else if (   j > 255
15604                      || ! isALPHANUMERIC(rangestart)
15605                      || ! isALPHANUMERIC(j)
15606                      || isDIGIT(rangestart) != isDIGIT(j)
15607                      || isUPPER(rangestart) != isUPPER(j)
15608                      || isLOWER(rangestart) != isLOWER(j)
15609
15610                         /* This final test should get optimized out except
15611                          * on EBCDIC platforms, where it causes ranges that
15612                          * cross discontinuities like i/j to be shown as hex
15613                          * instead of the misleading, e.g. H-K (since that
15614                          * range includes more than H, I, J, K). */
15615                      || (j - rangestart)
15616                          != NATIVE_TO_ASCII(j) - NATIVE_TO_ASCII(rangestart))
15617             {
15618                 Perl_sv_catpvf(aTHX_ sv, "\\x{%02x}-\\x{%02x}",
15619                                rangestart,
15620                                (j < 256) ? j : 255);
15621             }
15622             else { /* Here, the ends of the range are both digits, or both
15623                       uppercase, or both lowercase; and there's no
15624                       discontinuity in the range (which could happen on EBCDIC
15625                       platforms) */
15626                 put_byte(sv, rangestart);
15627                 sv_catpvs(sv, "-");
15628                 put_byte(sv, j);
15629             }
15630             rangestart = -1;
15631             has_output_anything = TRUE;
15632         }
15633     }
15634
15635     return has_output_anything;
15636 }
15637
15638 #define CLEAR_OPTSTART \
15639     if (optstart) STMT_START { \
15640             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
15641             optstart=NULL; \
15642     } STMT_END
15643
15644 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
15645
15646 STATIC const regnode *
15647 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
15648             const regnode *last, const regnode *plast,
15649             SV* sv, I32 indent, U32 depth)
15650 {
15651     dVAR;
15652     U8 op = PSEUDO;     /* Arbitrary non-END op. */
15653     const regnode *next;
15654     const regnode *optstart= NULL;
15655
15656     RXi_GET_DECL(r,ri);
15657     GET_RE_DEBUG_FLAGS_DECL;
15658
15659     PERL_ARGS_ASSERT_DUMPUNTIL;
15660
15661 #ifdef DEBUG_DUMPUNTIL
15662     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
15663         last ? last-start : 0,plast ? plast-start : 0);
15664 #endif
15665
15666     if (plast && plast < last)
15667         last= plast;
15668
15669     while (PL_regkind[op] != END && (!last || node < last)) {
15670         /* While that wasn't END last time... */
15671         NODE_ALIGN(node);
15672         op = OP(node);
15673         if (op == CLOSE || op == WHILEM)
15674             indent--;
15675         next = regnext((regnode *)node);
15676
15677         /* Where, what. */
15678         if (OP(node) == OPTIMIZED) {
15679             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
15680                 optstart = node;
15681             else
15682                 goto after_print;
15683         } else
15684             CLEAR_OPTSTART;
15685
15686         regprop(r, sv, node);
15687         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
15688                       (int)(2*indent + 1), "", SvPVX_const(sv));
15689
15690         if (OP(node) != OPTIMIZED) {
15691             if (next == NULL)           /* Next ptr. */
15692                 PerlIO_printf(Perl_debug_log, " (0)");
15693             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
15694                 PerlIO_printf(Perl_debug_log, " (FAIL)");
15695             else
15696                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
15697             (void)PerlIO_putc(Perl_debug_log, '\n');
15698         }
15699
15700       after_print:
15701         if (PL_regkind[(U8)op] == BRANCHJ) {
15702             assert(next);
15703             {
15704                 const regnode *nnode = (OP(next) == LONGJMP
15705                                        ? regnext((regnode *)next)
15706                                        : next);
15707                 if (last && nnode > last)
15708                     nnode = last;
15709                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
15710             }
15711         }
15712         else if (PL_regkind[(U8)op] == BRANCH) {
15713             assert(next);
15714             DUMPUNTIL(NEXTOPER(node), next);
15715         }
15716         else if ( PL_regkind[(U8)op]  == TRIE ) {
15717             const regnode *this_trie = node;
15718             const char op = OP(node);
15719             const U32 n = ARG(node);
15720             const reg_ac_data * const ac = op>=AHOCORASICK ?
15721                (reg_ac_data *)ri->data->data[n] :
15722                NULL;
15723             const reg_trie_data * const trie =
15724                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
15725 #ifdef DEBUGGING
15726             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
15727 #endif
15728             const regnode *nextbranch= NULL;
15729             I32 word_idx;
15730             sv_setpvs(sv, "");
15731             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
15732                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
15733
15734                 PerlIO_printf(Perl_debug_log, "%*s%s ",
15735                    (int)(2*(indent+3)), "",
15736                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
15737                             PL_colors[0], PL_colors[1],
15738                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
15739                             PERL_PV_PRETTY_ELLIPSES    |
15740                             PERL_PV_PRETTY_LTGT
15741                             )
15742                             : "???"
15743                 );
15744                 if (trie->jump) {
15745                     U16 dist= trie->jump[word_idx+1];
15746                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
15747                                   (UV)((dist ? this_trie + dist : next) - start));
15748                     if (dist) {
15749                         if (!nextbranch)
15750                             nextbranch= this_trie + trie->jump[0];
15751                         DUMPUNTIL(this_trie + dist, nextbranch);
15752                     }
15753                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
15754                         nextbranch= regnext((regnode *)nextbranch);
15755                 } else {
15756                     PerlIO_printf(Perl_debug_log, "\n");
15757                 }
15758             }
15759             if (last && next > last)
15760                 node= last;
15761             else
15762                 node= next;
15763         }
15764         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
15765             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
15766                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
15767         }
15768         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
15769             assert(next);
15770             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
15771         }
15772         else if ( op == PLUS || op == STAR) {
15773             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
15774         }
15775         else if (PL_regkind[(U8)op] == ANYOF) {
15776             /* arglen 1 + class block */
15777             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
15778                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
15779             node = NEXTOPER(node);
15780         }
15781         else if (PL_regkind[(U8)op] == EXACT) {
15782             /* Literal string, where present. */
15783             node += NODE_SZ_STR(node) - 1;
15784             node = NEXTOPER(node);
15785         }
15786         else {
15787             node = NEXTOPER(node);
15788             node += regarglen[(U8)op];
15789         }
15790         if (op == CURLYX || op == OPEN)
15791             indent++;
15792     }
15793     CLEAR_OPTSTART;
15794 #ifdef DEBUG_DUMPUNTIL
15795     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
15796 #endif
15797     return node;
15798 }
15799
15800 #endif  /* DEBUGGING */
15801
15802 /*
15803  * Local variables:
15804  * c-indentation-style: bsd
15805  * c-basic-offset: 4
15806  * indent-tabs-mode: nil
15807  * End:
15808  *
15809  * ex: set ts=8 sts=4 sw=4 et:
15810  */