regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 EXTERN_C const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_static.c"
  90 #include "charclass_invlists.h"
  91 #include "inline_invlist.c"
  92 #include "unicode_constants.h"
  93
  94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
  95  _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  96 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
  97  _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  98 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  99 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 100
 101 #ifndef STATIC
 102 #define STATIC  static
 103 #endif
 104
 105 #ifndef MIN
 106 #define MIN(a,b) ((a) < (b) ? (a) : (b))
 107 #endif
 108
 109 struct RExC_state_t {
 110     U32         flags;                  /* RXf_* are we folding, multilining? */
 111     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 112     char        *precomp;               /* uncompiled string. */
 113     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 114     regexp      *rx;                    /* perl core regexp structure */
 115     regexp_internal     *rxi;           /* internal data for regexp object
 116                                            pprivate field */
 117     char        *start;                 /* Start of input for compile */
 118     char        *end;                   /* End of input for compile */
 119     char        *parse;                 /* Input-scan pointer. */
 120     SSize_t     whilem_seen;            /* number of WHILEM in this expr */
 121     regnode     *emit_start;            /* Start of emitted-code area */
 122     regnode     *emit_bound;            /* First regnode outside of the
 123                                            allocated space */
 124     regnode     *emit;                  /* Code-emit pointer; if = &emit_dummy,
 125                                            implies compiling, so don't emit */
 126     regnode_ssc emit_dummy;             /* placeholder for emit to point to;
 127                                            large enough for the largest
 128                                            non-EXACTish node, so can use it as
 129                                            scratch in pass1 */
 130     I32         naughty;                /* How bad is this pattern? */
 131     I32         sawback;                /* Did we see \1, ...? */
 132     U32         seen;
 133     SSize_t     size;                   /* Code size. */
 134     I32                npar;            /* Capture buffer count, (OPEN) plus
 135                                            one. ("par" 0 is the whole
 136                                            pattern)*/
 137     I32         nestroot;               /* root parens we are in - used by
 138                                            accept */
 139     I32         extralen;
 140     I32         seen_zerolen;
 141     regnode     **open_parens;          /* pointers to open parens */
 142     regnode     **close_parens;         /* pointers to close parens */
 143     regnode     *opend;                 /* END node in program */
 144     I32         utf8;           /* whether the pattern is utf8 or not */
 145     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 146                                 /* XXX use this for future optimisation of case
 147                                  * where pattern must be upgraded to utf8. */
 148     I32         uni_semantics;  /* If a d charset modifier should use unicode
 149                                    rules, even if the pattern is not in
 150                                    utf8 */
 151     HV          *paren_names;           /* Paren names */
 152
 153     regnode     **recurse;              /* Recurse regops */
 154     I32         recurse_count;          /* Number of recurse regops */
 155     U8          *study_chunk_recursed;  /* bitmap of which parens we have moved
 156                                            through */
 157     U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
 158     I32         in_lookbehind;
 159     I32         contains_locale;
 160     I32         contains_i;
 161     I32         override_recoding;
 162     I32         in_multi_char_class;
 163     struct reg_code_block *code_blocks; /* positions of literal (?{})
 164                                             within pattern */
 165     int         num_code_blocks;        /* size of code_blocks[] */
 166     int         code_index;             /* next code_blocks[] slot */
 167     SSize_t     maxlen;                        /* mininum possible number of chars in string to match */
 168 #ifdef ADD_TO_REGEXEC
 169     char        *starttry;              /* -Dr: where regtry was called. */
 170 #define RExC_starttry   (pRExC_state->starttry)
 171 #endif
 172     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 173 #ifdef DEBUGGING
 174     const char  *lastparse;
 175     I32         lastnum;
 176     AV          *paren_name_list;       /* idx -> name */
 177     U32         study_chunk_recursed_count;
 178 #define RExC_lastparse  (pRExC_state->lastparse)
 179 #define RExC_lastnum    (pRExC_state->lastnum)
 180 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 181 #define RExC_study_chunk_recursed_count    (pRExC_state->study_chunk_recursed_count)
 182 #endif
 183 };
 184
 185 #define RExC_flags      (pRExC_state->flags)
 186 #define RExC_pm_flags   (pRExC_state->pm_flags)
 187 #define RExC_precomp    (pRExC_state->precomp)
 188 #define RExC_rx_sv      (pRExC_state->rx_sv)
 189 #define RExC_rx         (pRExC_state->rx)
 190 #define RExC_rxi        (pRExC_state->rxi)
 191 #define RExC_start      (pRExC_state->start)
 192 #define RExC_end        (pRExC_state->end)
 193 #define RExC_parse      (pRExC_state->parse)
 194 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 195 #ifdef RE_TRACK_PATTERN_OFFSETS
 196 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the
 197                                                          others */
 198 #endif
 199 #define RExC_emit       (pRExC_state->emit)
 200 #define RExC_emit_dummy (pRExC_state->emit_dummy)
 201 #define RExC_emit_start (pRExC_state->emit_start)
 202 #define RExC_emit_bound (pRExC_state->emit_bound)
 203 #define RExC_naughty    (pRExC_state->naughty)
 204 #define RExC_sawback    (pRExC_state->sawback)
 205 #define RExC_seen       (pRExC_state->seen)
 206 #define RExC_size       (pRExC_state->size)
 207 #define RExC_maxlen        (pRExC_state->maxlen)
 208 #define RExC_npar       (pRExC_state->npar)
 209 #define RExC_nestroot   (pRExC_state->nestroot)
 210 #define RExC_extralen   (pRExC_state->extralen)
 211 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 212 #define RExC_utf8       (pRExC_state->utf8)
 213 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 214 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 215 #define RExC_open_parens        (pRExC_state->open_parens)
 216 #define RExC_close_parens       (pRExC_state->close_parens)
 217 #define RExC_opend      (pRExC_state->opend)
 218 #define RExC_paren_names        (pRExC_state->paren_names)
 219 #define RExC_recurse    (pRExC_state->recurse)
 220 #define RExC_recurse_count      (pRExC_state->recurse_count)
 221 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
 222 #define RExC_study_chunk_recursed_bytes  \
 223                                    (pRExC_state->study_chunk_recursed_bytes)
 224 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 225 #define RExC_contains_locale    (pRExC_state->contains_locale)
 226 #define RExC_contains_i (pRExC_state->contains_i)
 227 #define RExC_override_recoding (pRExC_state->override_recoding)
 228 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 229
 230
 231 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 232 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 233         ((*s) == '{' && regcurly(s)))
 234
 235 /*
 236  * Flags to be passed up and down.
 237  */
 238 #define WORST           0       /* Worst case. */
 239 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 240
 241 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 242  * character.  (There needs to be a case: in the switch statement in regexec.c
 243  * for any node marked SIMPLE.)  Note that this is not the same thing as
 244  * REGNODE_SIMPLE */
 245 #define SIMPLE          0x02
 246 #define SPSTART         0x04    /* Starts with * or + */
 247 #define POSTPONED       0x08    /* (?1),(?&name), (??{...}) or similar */
 248 #define TRYAGAIN        0x10    /* Weeded out a declaration. */
 249 #define RESTART_UTF8    0x20    /* Restart, need to calcuate sizes as UTF-8 */
 250
 251 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 252
 253 /* whether trie related optimizations are enabled */
 254 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 255 #define TRIE_STUDY_OPT
 256 #define FULL_TRIE_STUDY
 257 #define TRIE_STCLASS
 258 #endif
 259
 260
 261
 262 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 263 #define PBITVAL(paren) (1 << ((paren) & 7))
 264 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 265 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 266 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 267
 268 #define REQUIRE_UTF8    STMT_START {                                       \
 269                                      if (!UTF) {                           \
 270                                          *flagp = RESTART_UTF8;            \
 271                                          return NULL;                      \
 272                                      }                                     \
 273                         } STMT_END
 274
 275 /* This converts the named class defined in regcomp.h to its equivalent class
 276  * number defined in handy.h. */
 277 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 278 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 279
 280 #define _invlist_union_complement_2nd(a, b, output) \
 281                         _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
 282 #define _invlist_intersection_complement_2nd(a, b, output) \
 283                  _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
 284
 285 /* About scan_data_t.
 286
 287   During optimisation we recurse through the regexp program performing
 288   various inplace (keyhole style) optimisations. In addition study_chunk
 289   and scan_commit populate this data structure with information about
 290   what strings MUST appear in the pattern. We look for the longest
 291   string that must appear at a fixed location, and we look for the
 292   longest string that may appear at a floating location. So for instance
 293   in the pattern:
 294
 295     /FOO[xX]A.*B[xX]BAR/
 296
 297   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 298   strings (because they follow a .* construct). study_chunk will identify
 299   both FOO and BAR as being the longest fixed and floating strings respectively.
 300
 301   The strings can be composites, for instance
 302
 303      /(f)(o)(o)/
 304
 305   will result in a composite fixed substring 'foo'.
 306
 307   For each string some basic information is maintained:
 308
 309   - offset or min_offset
 310     This is the position the string must appear at, or not before.
 311     It also implicitly (when combined with minlenp) tells us how many
 312     characters must match before the string we are searching for.
 313     Likewise when combined with minlenp and the length of the string it
 314     tells us how many characters must appear after the string we have
 315     found.
 316
 317   - max_offset
 318     Only used for floating strings. This is the rightmost point that
 319     the string can appear at. If set to SSize_t_MAX it indicates that the
 320     string can occur infinitely far to the right.
 321
 322   - minlenp
 323     A pointer to the minimum number of characters of the pattern that the
 324     string was found inside. This is important as in the case of positive
 325     lookahead or positive lookbehind we can have multiple patterns
 326     involved. Consider
 327
 328     /(?=FOO).*F/
 329
 330     The minimum length of the pattern overall is 3, the minimum length
 331     of the lookahead part is 3, but the minimum length of the part that
 332     will actually match is 1. So 'FOO's minimum length is 3, but the
 333     minimum length for the F is 1. This is important as the minimum length
 334     is used to determine offsets in front of and behind the string being
 335     looked for.  Since strings can be composites this is the length of the
 336     pattern at the time it was committed with a scan_commit. Note that
 337     the length is calculated by study_chunk, so that the minimum lengths
 338     are not known until the full pattern has been compiled, thus the
 339     pointer to the value.
 340
 341   - lookbehind
 342
 343     In the case of lookbehind the string being searched for can be
 344     offset past the start point of the final matching string.
 345     If this value was just blithely removed from the min_offset it would
 346     invalidate some of the calculations for how many chars must match
 347     before or after (as they are derived from min_offset and minlen and
 348     the length of the string being searched for).
 349     When the final pattern is compiled and the data is moved from the
 350     scan_data_t structure into the regexp structure the information
 351     about lookbehind is factored in, with the information that would
 352     have been lost precalculated in the end_shift field for the
 353     associated string.
 354
 355   The fields pos_min and pos_delta are used to store the minimum offset
 356   and the delta to the maximum offset at the current point in the pattern.
 357
 358 */
 359
 360 typedef struct scan_data_t {
 361     /*I32 len_min;      unused */
 362     /*I32 len_delta;    unused */
 363     SSize_t pos_min;
 364     SSize_t pos_delta;
 365     SV *last_found;
 366     SSize_t last_end;       /* min value, <0 unless valid. */
 367     SSize_t last_start_min;
 368     SSize_t last_start_max;
 369     SV **longest;           /* Either &l_fixed, or &l_float. */
 370     SV *longest_fixed;      /* longest fixed string found in pattern */
 371     SSize_t offset_fixed;   /* offset where it starts */
 372     SSize_t *minlen_fixed;  /* pointer to the minlen relevant to the string */
 373     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 374     SV *longest_float;      /* longest floating string found in pattern */
 375     SSize_t offset_float_min; /* earliest point in string it can appear */
 376     SSize_t offset_float_max; /* latest point in string it can appear */
 377     SSize_t *minlen_float;  /* pointer to the minlen relevant to the string */
 378     SSize_t lookbehind_float; /* is the pos of the string modified by LB */
 379     I32 flags;
 380     I32 whilem_c;
 381     SSize_t *last_closep;
 382     regnode_ssc *start_class;
 383 } scan_data_t;
 384
 385 /*
 386  * Forward declarations for pregcomp()'s friends.
 387  */
 388
 389 static const scan_data_t zero_scan_data =
 390   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 391
 392 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 393 #define SF_BEFORE_SEOL          0x0001
 394 #define SF_BEFORE_MEOL          0x0002
 395 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 396 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 397
 398 #define SF_FIX_SHIFT_EOL        (+2)
 399 #define SF_FL_SHIFT_EOL         (+4)
 400
 401 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 402 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 403
 404 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 405 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 406 #define SF_IS_INF               0x0040
 407 #define SF_HAS_PAR              0x0080
 408 #define SF_IN_PAR               0x0100
 409 #define SF_HAS_EVAL             0x0200
 410 #define SCF_DO_SUBSTR           0x0400
 411 #define SCF_DO_STCLASS_AND      0x0800
 412 #define SCF_DO_STCLASS_OR       0x1000
 413 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 414 #define SCF_WHILEM_VISITED_POS  0x2000
 415
 416 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 417 #define SCF_SEEN_ACCEPT         0x8000
 418 #define SCF_TRIE_DOING_RESTUDY 0x10000
 419
 420 #define UTF cBOOL(RExC_utf8)
 421
 422 /* The enums for all these are ordered so things work out correctly */
 423 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 424 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags)                    \
 425                                                      == REGEX_DEPENDS_CHARSET)
 426 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 427 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags)                \
 428                                                      >= REGEX_UNICODE_CHARSET)
 429 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags)                      \
 430                                             == REGEX_ASCII_RESTRICTED_CHARSET)
 431 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags)             \
 432                                             >= REGEX_ASCII_RESTRICTED_CHARSET)
 433 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags)                 \
 434                                         == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 435
 436 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 437
 438 /* For programs that want to be strictly Unicode compatible by dying if any
 439  * attempt is made to match a non-Unicode code point against a Unicode
 440  * property.  */
 441 #define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))
 442
 443 #define OOB_NAMEDCLASS          -1
 444
 445 /* There is no code point that is out-of-bounds, so this is problematic.  But
 446  * its only current use is to initialize a variable that is always set before
 447  * looked at. */
 448 #define OOB_UNICODE             0xDEADBEEF
 449
 450 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 451 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 452
 453
 454 /* length of regex to show in messages that don't mark a position within */
 455 #define RegexLengthToShowInErrorMessages 127
 456
 457 /*
 458  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 459  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 460  * op/pragma/warn/regcomp.
 461  */
 462 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 463 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 464
 465 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
 466                         " in m/%"UTF8f MARKER2 "%"UTF8f"/"
 467
 468 #define REPORT_LOCATION_ARGS(offset)            \
 469                 UTF8fARG(UTF, offset, RExC_precomp), \
 470                 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
 471
 472 /*
 473  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 474  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 475  * "...".
 476  */
 477 #define _FAIL(code) STMT_START {                                        \
 478     const char *ellipses = "";                                          \
 479     IV len = RExC_end - RExC_precomp;                                   \
 480                                                                         \
 481     if (!SIZE_ONLY)                                                     \
 482         SAVEFREESV(RExC_rx_sv);                                         \
 483     if (len > RegexLengthToShowInErrorMessages) {                       \
 484         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 485         len = RegexLengthToShowInErrorMessages - 10;                    \
 486         ellipses = "...";                                               \
 487     }                                                                   \
 488     code;                                                               \
 489 } STMT_END
 490
 491 #define FAIL(msg) _FAIL(                            \
 492     Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/",           \
 493             msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 494
 495 #define FAIL2(msg,arg) _FAIL(                       \
 496     Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/",         \
 497             arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 498
 499 /*
 500  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 501  */
 502 #define Simple_vFAIL(m) STMT_START {                                    \
 503     const IV offset =                                                   \
 504         (RExC_parse > RExC_end ? RExC_end : RExC_parse) - RExC_precomp; \
 505     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 506             m, REPORT_LOCATION_ARGS(offset));   \
 507 } STMT_END
 508
 509 /*
 510  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 511  */
 512 #define vFAIL(m) STMT_START {                           \
 513     if (!SIZE_ONLY)                                     \
 514         SAVEFREESV(RExC_rx_sv);                         \
 515     Simple_vFAIL(m);                                    \
 516 } STMT_END
 517
 518 /*
 519  * Like Simple_vFAIL(), but accepts two arguments.
 520  */
 521 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 522     const IV offset = RExC_parse - RExC_precomp;                        \
 523     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,                      \
 524                       REPORT_LOCATION_ARGS(offset));    \
 525 } STMT_END
 526
 527 /*
 528  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 529  */
 530 #define vFAIL2(m,a1) STMT_START {                       \
 531     if (!SIZE_ONLY)                                     \
 532         SAVEFREESV(RExC_rx_sv);                         \
 533     Simple_vFAIL2(m, a1);                               \
 534 } STMT_END
 535
 536
 537 /*
 538  * Like Simple_vFAIL(), but accepts three arguments.
 539  */
 540 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 541     const IV offset = RExC_parse - RExC_precomp;                \
 542     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,          \
 543             REPORT_LOCATION_ARGS(offset));      \
 544 } STMT_END
 545
 546 /*
 547  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 548  */
 549 #define vFAIL3(m,a1,a2) STMT_START {                    \
 550     if (!SIZE_ONLY)                                     \
 551         SAVEFREESV(RExC_rx_sv);                         \
 552     Simple_vFAIL3(m, a1, a2);                           \
 553 } STMT_END
 554
 555 /*
 556  * Like Simple_vFAIL(), but accepts four arguments.
 557  */
 558 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 559     const IV offset = RExC_parse - RExC_precomp;                \
 560     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3,              \
 561             REPORT_LOCATION_ARGS(offset));      \
 562 } STMT_END
 563
 564 #define vFAIL4(m,a1,a2,a3) STMT_START {                 \
 565     if (!SIZE_ONLY)                                     \
 566         SAVEFREESV(RExC_rx_sv);                         \
 567     Simple_vFAIL4(m, a1, a2, a3);                       \
 568 } STMT_END
 569
 570 /* A specialized version of vFAIL2 that works with UTF8f */
 571 #define vFAIL2utf8f(m, a1) STMT_START { \
 572     const IV offset = RExC_parse - RExC_precomp;   \
 573     if (!SIZE_ONLY)                                \
 574         SAVEFREESV(RExC_rx_sv);                    \
 575     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
 576             REPORT_LOCATION_ARGS(offset));         \
 577 } STMT_END
 578
 579 /* These have asserts in them because of [perl #122671] Many warnings in
 580  * regcomp.c can occur twice.  If they get output in pass1 and later in that
 581  * pass, the pattern has to be converted to UTF-8 and the pass restarted, they
 582  * would get output again.  So they should be output in pass2, and these
 583  * asserts make sure new warnings follow that paradigm. */
 584
 585 /* m is not necessarily a "literal string", in this macro */
 586 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
 587     const IV offset = loc - RExC_precomp;                               \
 588     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
 589             m, REPORT_LOCATION_ARGS(offset));       \
 590 } STMT_END
 591
 592 #define ckWARNreg(loc,m) STMT_START {                                   \
 593     const IV offset = loc - RExC_precomp;                               \
 594     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,     \
 595             REPORT_LOCATION_ARGS(offset));              \
 596 } STMT_END
 597
 598 #define vWARN_dep(loc, m) STMT_START {                                  \
 599     const IV offset = loc - RExC_precomp;                               \
 600     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION,    \
 601             REPORT_LOCATION_ARGS(offset));              \
 602 } STMT_END
 603
 604 #define ckWARNdep(loc,m) STMT_START {                                   \
 605     const IV offset = loc - RExC_precomp;                               \
 606     __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),                  \
 607             m REPORT_LOCATION,                                          \
 608             REPORT_LOCATION_ARGS(offset));              \
 609 } STMT_END
 610
 611 #define ckWARNregdep(loc,m) STMT_START {                                \
 612     const IV offset = loc - RExC_precomp;                               \
 613     __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),    \
 614             m REPORT_LOCATION,                                          \
 615             REPORT_LOCATION_ARGS(offset));              \
 616 } STMT_END
 617
 618 #define ckWARN2reg_d(loc,m, a1) STMT_START {                            \
 619     const IV offset = loc - RExC_precomp;                               \
 620     __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),                      \
 621             m REPORT_LOCATION,                                          \
 622             a1, REPORT_LOCATION_ARGS(offset));  \
 623 } STMT_END
 624
 625 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 626     const IV offset = loc - RExC_precomp;                               \
 627     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,     \
 628             a1, REPORT_LOCATION_ARGS(offset));  \
 629 } STMT_END
 630
 631 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 632     const IV offset = loc - RExC_precomp;                               \
 633     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,                \
 634             a1, a2, REPORT_LOCATION_ARGS(offset));      \
 635 } STMT_END
 636
 637 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 638     const IV offset = loc - RExC_precomp;                               \
 639     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,     \
 640             a1, a2, REPORT_LOCATION_ARGS(offset));      \
 641 } STMT_END
 642
 643 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 644     const IV offset = loc - RExC_precomp;                               \
 645     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,                \
 646             a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 647 } STMT_END
 648
 649 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 650     const IV offset = loc - RExC_precomp;                               \
 651     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,     \
 652             a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 653 } STMT_END
 654
 655 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 656     const IV offset = loc - RExC_precomp;                               \
 657     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,                \
 658             a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
 659 } STMT_END
 660
 661
 662 /* Allow for side effects in s */
 663 #define REGC(c,s) STMT_START {                  \
 664     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 665 } STMT_END
 666
 667 /* Macros for recording node offsets.   20001227 mjd@plover.com
 668  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 669  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 670  * Element 0 holds the number n.
 671  * Position is 1 indexed.
 672  */
 673 #ifndef RE_TRACK_PATTERN_OFFSETS
 674 #define Set_Node_Offset_To_R(node,byte)
 675 #define Set_Node_Offset(node,byte)
 676 #define Set_Cur_Node_Offset
 677 #define Set_Node_Length_To_R(node,len)
 678 #define Set_Node_Length(node,len)
 679 #define Set_Node_Cur_Length(node,start)
 680 #define Node_Offset(n)
 681 #define Node_Length(n)
 682 #define Set_Node_Offset_Length(node,offset,len)
 683 #define ProgLen(ri) ri->u.proglen
 684 #define SetProgLen(ri,x) ri->u.proglen = x
 685 #else
 686 #define ProgLen(ri) ri->u.offsets[0]
 687 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 688 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 689     if (! SIZE_ONLY) {                                                  \
 690         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 691                     __LINE__, (int)(node), (int)(byte)));               \
 692         if((node) < 0) {                                                \
 693             Perl_croak(aTHX_ "value of node is %d in Offset macro",     \
 694                                          (int)(node));                  \
 695         } else {                                                        \
 696             RExC_offsets[2*(node)-1] = (byte);                          \
 697         }                                                               \
 698     }                                                                   \
 699 } STMT_END
 700
 701 #define Set_Node_Offset(node,byte) \
 702     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 703 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 704
 705 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 706     if (! SIZE_ONLY) {                                                  \
 707         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 708                 __LINE__, (int)(node), (int)(len)));                    \
 709         if((node) < 0) {                                                \
 710             Perl_croak(aTHX_ "value of node is %d in Length macro",     \
 711                                          (int)(node));                  \
 712         } else {                                                        \
 713             RExC_offsets[2*(node)] = (len);                             \
 714         }                                                               \
 715     }                                                                   \
 716 } STMT_END
 717
 718 #define Set_Node_Length(node,len) \
 719     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 720 #define Set_Node_Cur_Length(node, start)                \
 721     Set_Node_Length(node, RExC_parse - start)
 722
 723 /* Get offsets and lengths */
 724 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 725 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 726
 727 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 728     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 729     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 730 } STMT_END
 731 #endif
 732
 733 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 734 #define EXPERIMENTAL_INPLACESCAN
 735 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 736
 737 #define DEBUG_RExC_seen() \
 738         DEBUG_OPTIMISE_MORE_r({                                             \
 739             PerlIO_printf(Perl_debug_log,"RExC_seen: ");                    \
 740                                                                             \
 741             if (RExC_seen & REG_ZERO_LEN_SEEN)                              \
 742                 PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN ");         \
 743                                                                             \
 744             if (RExC_seen & REG_LOOKBEHIND_SEEN)                            \
 745                 PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN ");       \
 746                                                                             \
 747             if (RExC_seen & REG_GPOS_SEEN)                                  \
 748                 PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN ");             \
 749                                                                             \
 750             if (RExC_seen & REG_CANY_SEEN)                                  \
 751                 PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN ");             \
 752                                                                             \
 753             if (RExC_seen & REG_RECURSE_SEEN)                               \
 754                 PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN ");          \
 755                                                                             \
 756             if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)                         \
 757                 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN ");    \
 758                                                                             \
 759             if (RExC_seen & REG_VERBARG_SEEN)                               \
 760                 PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN ");          \
 761                                                                             \
 762             if (RExC_seen & REG_CUTGROUP_SEEN)                              \
 763                 PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN ");         \
 764                                                                             \
 765             if (RExC_seen & REG_RUN_ON_COMMENT_SEEN)                        \
 766                 PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN ");   \
 767                                                                             \
 768             if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)                        \
 769                 PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN ");   \
 770                                                                             \
 771             if (RExC_seen & REG_GOSTART_SEEN)                               \
 772                 PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN ");          \
 773                                                                             \
 774             if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                               \
 775                 PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN ");          \
 776                                                                             \
 777             PerlIO_printf(Perl_debug_log,"\n");                             \
 778         });
 779
 780 #define DEBUG_STUDYDATA(str,data,depth)                              \
 781 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 782     PerlIO_printf(Perl_debug_log,                                    \
 783         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 784         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 785         (int)(depth)*2, "",                                          \
 786         (IV)((data)->pos_min),                                       \
 787         (IV)((data)->pos_delta),                                     \
 788         (UV)((data)->flags),                                         \
 789         (IV)((data)->whilem_c),                                      \
 790         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 791         is_inf ? "INF " : ""                                         \
 792     );                                                               \
 793     if ((data)->last_found)                                          \
 794         PerlIO_printf(Perl_debug_log,                                \
 795             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 796             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 797             SvPVX_const((data)->last_found),                         \
 798             (IV)((data)->last_end),                                  \
 799             (IV)((data)->last_start_min),                            \
 800             (IV)((data)->last_start_max),                            \
 801             ((data)->longest &&                                      \
 802              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 803             SvPVX_const((data)->longest_fixed),                      \
 804             (IV)((data)->offset_fixed),                              \
 805             ((data)->longest &&                                      \
 806              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 807             SvPVX_const((data)->longest_float),                      \
 808             (IV)((data)->offset_float_min),                          \
 809             (IV)((data)->offset_float_max)                           \
 810         );                                                           \
 811     PerlIO_printf(Perl_debug_log,"\n");                              \
 812 });
 813
 814 #ifdef DEBUGGING
 815
 816 /* is c a control character for which we have a mnemonic? */
 817 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 818
 819 STATIC const char *
 820 S_cntrl_to_mnemonic(const U8 c)
 821 {
 822     /* Returns the mnemonic string that represents character 'c', if one
 823      * exists; NULL otherwise.  The only ones that exist for the purposes of
 824      * this routine are a few control characters */
 825
 826     switch (c) {
 827         case '\a':       return "\\a";
 828         case '\b':       return "\\b";
 829         case ESC_NATIVE: return "\\e";
 830         case '\f':       return "\\f";
 831         case '\n':       return "\\n";
 832         case '\r':       return "\\r";
 833         case '\t':       return "\\t";
 834     }
 835
 836     return NULL;
 837 }
 838
 839 #endif
 840
 841 /* Mark that we cannot extend a found fixed substring at this point.
 842    Update the longest found anchored substring and the longest found
 843    floating substrings if needed. */
 844
 845 STATIC void
 846 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
 847                     SSize_t *minlenp, int is_inf)
 848 {
 849     const STRLEN l = CHR_SVLEN(data->last_found);
 850     const STRLEN old_l = CHR_SVLEN(*data->longest);
 851     GET_RE_DEBUG_FLAGS_DECL;
 852
 853     PERL_ARGS_ASSERT_SCAN_COMMIT;
 854
 855     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 856         SvSetMagicSV(*data->longest, data->last_found);
 857         if (*data->longest == data->longest_fixed) {
 858             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 859             if (data->flags & SF_BEFORE_EOL)
 860                 data->flags
 861                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 862             else
 863                 data->flags &= ~SF_FIX_BEFORE_EOL;
 864             data->minlen_fixed=minlenp;
 865             data->lookbehind_fixed=0;
 866         }
 867         else { /* *data->longest == data->longest_float */
 868             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 869             data->offset_float_max = (l
 870                                       ? data->last_start_max
 871                                       : (data->pos_delta == SSize_t_MAX
 872                                          ? SSize_t_MAX
 873                                          : data->pos_min + data->pos_delta));
 874             if (is_inf
 875                  || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
 876                 data->offset_float_max = SSize_t_MAX;
 877             if (data->flags & SF_BEFORE_EOL)
 878                 data->flags
 879                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 880             else
 881                 data->flags &= ~SF_FL_BEFORE_EOL;
 882             data->minlen_float=minlenp;
 883             data->lookbehind_float=0;
 884         }
 885     }
 886     SvCUR_set(data->last_found, 0);
 887     {
 888         SV * const sv = data->last_found;
 889         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 890             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 891             if (mg)
 892                 mg->mg_len = 0;
 893         }
 894     }
 895     data->last_end = -1;
 896     data->flags &= ~SF_BEFORE_EOL;
 897     DEBUG_STUDYDATA("commit: ",data,0);
 898 }
 899
 900 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
 901  * list that describes which code points it matches */
 902
 903 STATIC void
 904 S_ssc_anything(pTHX_ regnode_ssc *ssc)
 905 {
 906     /* Set the SSC 'ssc' to match an empty string or any code point */
 907
 908     PERL_ARGS_ASSERT_SSC_ANYTHING;
 909
 910     assert(is_ANYOF_SYNTHETIC(ssc));
 911
 912     ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
 913     _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
 914     ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING;  /* Plus matches empty */
 915 }
 916
 917 STATIC int
 918 S_ssc_is_anything(const regnode_ssc *ssc)
 919 {
 920     /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
 921      * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
 922      * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
 923      * in any way, so there's no point in using it */
 924
 925     UV start, end;
 926     bool ret;
 927
 928     PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
 929
 930     assert(is_ANYOF_SYNTHETIC(ssc));
 931
 932     if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
 933         return FALSE;
 934     }
 935
 936     /* See if the list consists solely of the range 0 - Infinity */
 937     invlist_iterinit(ssc->invlist);
 938     ret = invlist_iternext(ssc->invlist, &start, &end)
 939           && start == 0
 940           && end == UV_MAX;
 941
 942     invlist_iterfinish(ssc->invlist);
 943
 944     if (ret) {
 945         return TRUE;
 946     }
 947
 948     /* If e.g., both \w and \W are set, matches everything */
 949     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
 950         int i;
 951         for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
 952             if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
 953                 return TRUE;
 954             }
 955         }
 956     }
 957
 958     return FALSE;
 959 }
 960
 961 STATIC void
 962 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
 963 {
 964     /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
 965      * string, any code point, or any posix class under locale */
 966
 967     PERL_ARGS_ASSERT_SSC_INIT;
 968
 969     Zero(ssc, 1, regnode_ssc);
 970     set_ANYOF_SYNTHETIC(ssc);
 971     ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
 972     ssc_anything(ssc);
 973
 974     /* If any portion of the regex is to operate under locale rules,
 975      * initialization includes it.  The reason this isn't done for all regexes
 976      * is that the optimizer was written under the assumption that locale was
 977      * all-or-nothing.  Given the complexity and lack of documentation in the
 978      * optimizer, and that there are inadequate test cases for locale, many
 979      * parts of it may not work properly, it is safest to avoid locale unless
 980      * necessary. */
 981     if (RExC_contains_locale) {
 982         ANYOF_POSIXL_SETALL(ssc);
 983     }
 984     else {
 985         ANYOF_POSIXL_ZERO(ssc);
 986     }
 987 }
 988
 989 STATIC int
 990 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
 991                         const regnode_ssc *ssc)
 992 {
 993     /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
 994      * to the list of code points matched, and locale posix classes; hence does
 995      * not check its flags) */
 996
 997     UV start, end;
 998     bool ret;
 999
1000     PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1001
1002     assert(is_ANYOF_SYNTHETIC(ssc));
1003
1004     invlist_iterinit(ssc->invlist);
1005     ret = invlist_iternext(ssc->invlist, &start, &end)
1006           && start == 0
1007           && end == UV_MAX;
1008
1009     invlist_iterfinish(ssc->invlist);
1010
1011     if (! ret) {
1012         return FALSE;
1013     }
1014
1015     if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1016         return FALSE;
1017     }
1018
1019     return TRUE;
1020 }
1021
1022 STATIC SV*
1023 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1024                                const regnode_charclass* const node)
1025 {
1026     /* Returns a mortal inversion list defining which code points are matched
1027      * by 'node', which is of type ANYOF.  Handles complementing the result if
1028      * appropriate.  If some code points aren't knowable at this time, the
1029      * returned list must, and will, contain every code point that is a
1030      * possibility. */
1031
1032     SV* invlist = sv_2mortal(_new_invlist(0));
1033     SV* only_utf8_locale_invlist = NULL;
1034     unsigned int i;
1035     const U32 n = ARG(node);
1036     bool new_node_has_latin1 = FALSE;
1037
1038     PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1039
1040     /* Look at the data structure created by S_set_ANYOF_arg() */
1041     if (n != ANYOF_ONLY_HAS_BITMAP) {
1042         SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1043         AV * const av = MUTABLE_AV(SvRV(rv));
1044         SV **const ary = AvARRAY(av);
1045         assert(RExC_rxi->data->what[n] == 's');
1046
1047         if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1048             invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1049         }
1050         else if (ary[0] && ary[0] != &PL_sv_undef) {
1051
1052             /* Here, no compile-time swash, and there are things that won't be
1053              * known until runtime -- we have to assume it could be anything */
1054             return _add_range_to_invlist(invlist, 0, UV_MAX);
1055         }
1056         else if (ary[3] && ary[3] != &PL_sv_undef) {
1057
1058             /* Here no compile-time swash, and no run-time only data.  Use the
1059              * node's inversion list */
1060             invlist = sv_2mortal(invlist_clone(ary[3]));
1061         }
1062
1063         /* Get the code points valid only under UTF-8 locales */
1064         if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
1065             && ary[2] && ary[2] != &PL_sv_undef)
1066         {
1067             only_utf8_locale_invlist = ary[2];
1068         }
1069     }
1070
1071     /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1072      * code points, and an inversion list for the others, but if there are code
1073      * points that should match only conditionally on the target string being
1074      * UTF-8, those are placed in the inversion list, and not the bitmap.
1075      * Since there are circumstances under which they could match, they are
1076      * included in the SSC.  But if the ANYOF node is to be inverted, we have
1077      * to exclude them here, so that when we invert below, the end result
1078      * actually does include them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We
1079      * have to do this here before we add the unconditionally matched code
1080      * points */
1081     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1082         _invlist_intersection_complement_2nd(invlist,
1083                                              PL_UpperLatin1,
1084                                              &invlist);
1085     }
1086
1087     /* Add in the points from the bit map */
1088     for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1089         if (ANYOF_BITMAP_TEST(node, i)) {
1090             invlist = add_cp_to_invlist(invlist, i);
1091             new_node_has_latin1 = TRUE;
1092         }
1093     }
1094
1095     /* If this can match all upper Latin1 code points, have to add them
1096      * as well */
1097     if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
1098         _invlist_union(invlist, PL_UpperLatin1, &invlist);
1099     }
1100
1101     /* Similarly for these */
1102     if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1103         _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1104     }
1105
1106     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1107         _invlist_invert(invlist);
1108     }
1109     else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
1110
1111         /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1112          * locale.  We can skip this if there are no 0-255 at all. */
1113         _invlist_union(invlist, PL_Latin1, &invlist);
1114     }
1115
1116     /* Similarly add the UTF-8 locale possible matches.  These have to be
1117      * deferred until after the non-UTF-8 locale ones are taken care of just
1118      * above, or it leads to wrong results under ANYOF_INVERT */
1119     if (only_utf8_locale_invlist) {
1120         _invlist_union_maybe_complement_2nd(invlist,
1121                                             only_utf8_locale_invlist,
1122                                             ANYOF_FLAGS(node) & ANYOF_INVERT,
1123                                             &invlist);
1124     }
1125
1126     return invlist;
1127 }
1128
1129 /* These two functions currently do the exact same thing */
1130 #define ssc_init_zero           ssc_init
1131
1132 #define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
1133 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1134
1135 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
1136  * should not be inverted.  'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1137  * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1138
1139 STATIC void
1140 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1141                 const regnode_charclass *and_with)
1142 {
1143     /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1144      * another SSC or a regular ANYOF class.  Can create false positives. */
1145
1146     SV* anded_cp_list;
1147     U8  anded_flags;
1148
1149     PERL_ARGS_ASSERT_SSC_AND;
1150
1151     assert(is_ANYOF_SYNTHETIC(ssc));
1152
1153     /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1154      * the code point inversion list and just the relevant flags */
1155     if (is_ANYOF_SYNTHETIC(and_with)) {
1156         anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1157         anded_flags = ANYOF_FLAGS(and_with);
1158
1159         /* XXX This is a kludge around what appears to be deficiencies in the
1160          * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
1161          * there are paths through the optimizer where it doesn't get weeded
1162          * out when it should.  And if we don't make some extra provision for
1163          * it like the code just below, it doesn't get added when it should.
1164          * This solution is to add it only when AND'ing, which is here, and
1165          * only when what is being AND'ed is the pristine, original node
1166          * matching anything.  Thus it is like adding it to ssc_anything() but
1167          * only when the result is to be AND'ed.  Probably the same solution
1168          * could be adopted for the same problem we have with /l matching,
1169          * which is solved differently in S_ssc_init(), and that would lead to
1170          * fewer false positives than that solution has.  But if this solution
1171          * creates bugs, the consequences are only that a warning isn't raised
1172          * that should be; while the consequences for having /l bugs is
1173          * incorrect matches */
1174         if (ssc_is_anything((regnode_ssc *)and_with)) {
1175             anded_flags |= ANYOF_WARN_SUPER;
1176         }
1177     }
1178     else {
1179         anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1180         anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1181     }
1182
1183     ANYOF_FLAGS(ssc) &= anded_flags;
1184
1185     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1186      * C2 is the list of code points in 'and-with'; P2, its posix classes.
1187      * 'and_with' may be inverted.  When not inverted, we have the situation of
1188      * computing:
1189      *  (C1 | P1) & (C2 | P2)
1190      *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1191      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1192      *                    <=  ((C1 & C2) |       P2)) | ( P1       | (P1 & P2))
1193      *                    <=  ((C1 & C2) | P1 | P2)
1194      * Alternatively, the last few steps could be:
1195      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1196      *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
1197      *                    <=  (C1 | C2 | (P1 & P2))
1198      * We favor the second approach if either P1 or P2 is non-empty.  This is
1199      * because these components are a barrier to doing optimizations, as what
1200      * they match cannot be known until the moment of matching as they are
1201      * dependent on the current locale, 'AND"ing them likely will reduce or
1202      * eliminate them.
1203      * But we can do better if we know that C1,P1 are in their initial state (a
1204      * frequent occurrence), each matching everything:
1205      *  (<everything>) & (C2 | P2) =  C2 | P2
1206      * Similarly, if C2,P2 are in their initial state (again a frequent
1207      * occurrence), the result is a no-op
1208      *  (C1 | P1) & (<everything>) =  C1 | P1
1209      *
1210      * Inverted, we have
1211      *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
1212      *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1213      *                         <=  (C1 & ~C2) | (P1 & ~P2)
1214      * */
1215
1216     if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1217         && ! is_ANYOF_SYNTHETIC(and_with))
1218     {
1219         unsigned int i;
1220
1221         ssc_intersection(ssc,
1222                          anded_cp_list,
1223                          FALSE /* Has already been inverted */
1224                          );
1225
1226         /* If either P1 or P2 is empty, the intersection will be also; can skip
1227          * the loop */
1228         if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
1229             ANYOF_POSIXL_ZERO(ssc);
1230         }
1231         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1232
1233             /* Note that the Posix class component P from 'and_with' actually
1234              * looks like:
1235              *      P = Pa | Pb | ... | Pn
1236              * where each component is one posix class, such as in [\w\s].
1237              * Thus
1238              *      ~P = ~(Pa | Pb | ... | Pn)
1239              *         = ~Pa & ~Pb & ... & ~Pn
1240              *        <= ~Pa | ~Pb | ... | ~Pn
1241              * The last is something we can easily calculate, but unfortunately
1242              * is likely to have many false positives.  We could do better
1243              * in some (but certainly not all) instances if two classes in
1244              * P have known relationships.  For example
1245              *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1246              * So
1247              *      :lower: & :print: = :lower:
1248              * And similarly for classes that must be disjoint.  For example,
1249              * since \s and \w can have no elements in common based on rules in
1250              * the POSIX standard,
1251              *      \w & ^\S = nothing
1252              * Unfortunately, some vendor locales do not meet the Posix
1253              * standard, in particular almost everything by Microsoft.
1254              * The loop below just changes e.g., \w into \W and vice versa */
1255
1256             regnode_charclass_posixl temp;
1257             int add = 1;    /* To calculate the index of the complement */
1258
1259             ANYOF_POSIXL_ZERO(&temp);
1260             for (i = 0; i < ANYOF_MAX; i++) {
1261                 assert(i % 2 != 0
1262                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1263                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1264
1265                 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1266                     ANYOF_POSIXL_SET(&temp, i + add);
1267                 }
1268                 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1269             }
1270             ANYOF_POSIXL_AND(&temp, ssc);
1271
1272         } /* else ssc already has no posixes */
1273     } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
1274          in its initial state */
1275     else if (! is_ANYOF_SYNTHETIC(and_with)
1276              || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1277     {
1278         /* But if 'ssc' is in its initial state, the result is just 'and_with';
1279          * copy it over 'ssc' */
1280         if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1281             if (is_ANYOF_SYNTHETIC(and_with)) {
1282                 StructCopy(and_with, ssc, regnode_ssc);
1283             }
1284             else {
1285                 ssc->invlist = anded_cp_list;
1286                 ANYOF_POSIXL_ZERO(ssc);
1287                 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1288                     ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1289                 }
1290             }
1291         }
1292         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1293                  || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
1294         {
1295             /* One or the other of P1, P2 is non-empty. */
1296             if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1297                 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1298             }
1299             ssc_union(ssc, anded_cp_list, FALSE);
1300         }
1301         else { /* P1 = P2 = empty */
1302             ssc_intersection(ssc, anded_cp_list, FALSE);
1303         }
1304     }
1305 }
1306
1307 STATIC void
1308 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1309                const regnode_charclass *or_with)
1310 {
1311     /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1312      * another SSC or a regular ANYOF class.  Can create false positives if
1313      * 'or_with' is to be inverted. */
1314
1315     SV* ored_cp_list;
1316     U8 ored_flags;
1317
1318     PERL_ARGS_ASSERT_SSC_OR;
1319
1320     assert(is_ANYOF_SYNTHETIC(ssc));
1321
1322     /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1323      * the code point inversion list and just the relevant flags */
1324     if (is_ANYOF_SYNTHETIC(or_with)) {
1325         ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1326         ored_flags = ANYOF_FLAGS(or_with);
1327     }
1328     else {
1329         ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1330         ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1331     }
1332
1333     ANYOF_FLAGS(ssc) |= ored_flags;
1334
1335     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1336      * C2 is the list of code points in 'or-with'; P2, its posix classes.
1337      * 'or_with' may be inverted.  When not inverted, we have the simple
1338      * situation of computing:
1339      *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
1340      * If P1|P2 yields a situation with both a class and its complement are
1341      * set, like having both \w and \W, this matches all code points, and we
1342      * can delete these from the P component of the ssc going forward.  XXX We
1343      * might be able to delete all the P components, but I (khw) am not certain
1344      * about this, and it is better to be safe.
1345      *
1346      * Inverted, we have
1347      *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
1348      *                         <=  (C1 | P1) | ~C2
1349      *                         <=  (C1 | ~C2) | P1
1350      * (which results in actually simpler code than the non-inverted case)
1351      * */
1352
1353     if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1354         && ! is_ANYOF_SYNTHETIC(or_with))
1355     {
1356         /* We ignore P2, leaving P1 going forward */
1357     }   /* else  Not inverted */
1358     else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
1359         ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1360         if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1361             unsigned int i;
1362             for (i = 0; i < ANYOF_MAX; i += 2) {
1363                 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1364                 {
1365                     ssc_match_all_cp(ssc);
1366                     ANYOF_POSIXL_CLEAR(ssc, i);
1367                     ANYOF_POSIXL_CLEAR(ssc, i+1);
1368                 }
1369             }
1370         }
1371     }
1372
1373     ssc_union(ssc,
1374               ored_cp_list,
1375               FALSE /* Already has been inverted */
1376               );
1377 }
1378
1379 PERL_STATIC_INLINE void
1380 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1381 {
1382     PERL_ARGS_ASSERT_SSC_UNION;
1383
1384     assert(is_ANYOF_SYNTHETIC(ssc));
1385
1386     _invlist_union_maybe_complement_2nd(ssc->invlist,
1387                                         invlist,
1388                                         invert2nd,
1389                                         &ssc->invlist);
1390 }
1391
1392 PERL_STATIC_INLINE void
1393 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1394                          SV* const invlist,
1395                          const bool invert2nd)
1396 {
1397     PERL_ARGS_ASSERT_SSC_INTERSECTION;
1398
1399     assert(is_ANYOF_SYNTHETIC(ssc));
1400
1401     _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1402                                                invlist,
1403                                                invert2nd,
1404                                                &ssc->invlist);
1405 }
1406
1407 PERL_STATIC_INLINE void
1408 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1409 {
1410     PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1411
1412     assert(is_ANYOF_SYNTHETIC(ssc));
1413
1414     ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1415 }
1416
1417 PERL_STATIC_INLINE void
1418 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1419 {
1420     /* AND just the single code point 'cp' into the SSC 'ssc' */
1421
1422     SV* cp_list = _new_invlist(2);
1423
1424     PERL_ARGS_ASSERT_SSC_CP_AND;
1425
1426     assert(is_ANYOF_SYNTHETIC(ssc));
1427
1428     cp_list = add_cp_to_invlist(cp_list, cp);
1429     ssc_intersection(ssc, cp_list,
1430                      FALSE /* Not inverted */
1431                      );
1432     SvREFCNT_dec_NN(cp_list);
1433 }
1434
1435 PERL_STATIC_INLINE void
1436 S_ssc_clear_locale(regnode_ssc *ssc)
1437 {
1438     /* Set the SSC 'ssc' to not match any locale things */
1439     PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1440
1441     assert(is_ANYOF_SYNTHETIC(ssc));
1442
1443     ANYOF_POSIXL_ZERO(ssc);
1444     ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1445 }
1446
1447 #define NON_OTHER_COUNT   NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
1448
1449 STATIC bool
1450 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
1451 {
1452     /* The synthetic start class is used to hopefully quickly winnow down
1453      * places where a pattern could start a match in the target string.  If it
1454      * doesn't really narrow things down that much, there isn't much point to
1455      * having the overhead of using it.  This function uses some very crude
1456      * heuristics to decide if to use the ssc or not.
1457      *
1458      * It returns TRUE if 'ssc' rules out more than half what it considers to
1459      * be the "likely" possible matches, but of course it doesn't know what the
1460      * actual things being matched are going to be; these are only guesses
1461      *
1462      * For /l matches, it assumes that the only likely matches are going to be
1463      *      in the 0-255 range, uniformly distributed, so half of that is 127
1464      * For /a and /d matches, it assumes that the likely matches will be just
1465      *      the ASCII range, so half of that is 63
1466      * For /u and there isn't anything matching above the Latin1 range, it
1467      *      assumes that that is the only range likely to be matched, and uses
1468      *      half that as the cut-off: 127.  If anything matches above Latin1,
1469      *      it assumes that all of Unicode could match (uniformly), except for
1470      *      non-Unicode code points and things in the General Category "Other"
1471      *      (unassigned, private use, surrogates, controls and formats).  This
1472      *      is a much large number. */
1473
1474     const U32 max_match = (LOC)
1475                           ? 127
1476                           : (! UNI_SEMANTICS)
1477                             ? 63
1478                             : (invlist_highest(ssc->invlist) < 256)
1479                               ? 127
1480                               : ((NON_OTHER_COUNT + 1) / 2) - 1;
1481     U32 count = 0;      /* Running total of number of code points matched by
1482                            'ssc' */
1483     UV start, end;      /* Start and end points of current range in inversion
1484                            list */
1485
1486     PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
1487
1488     invlist_iterinit(ssc->invlist);
1489     while (invlist_iternext(ssc->invlist, &start, &end)) {
1490
1491         /* /u is the only thing that we expect to match above 255; so if not /u
1492          * and even if there are matches above 255, ignore them.  This catches
1493          * things like \d under /d which does match the digits above 255, but
1494          * since the pattern is /d, it is not likely to be expecting them */
1495         if (! UNI_SEMANTICS) {
1496             if (start > 255) {
1497                 break;
1498             }
1499             end = MIN(end, 255);
1500         }
1501         count += end - start + 1;
1502         if (count > max_match) {
1503             invlist_iterfinish(ssc->invlist);
1504             return FALSE;
1505         }
1506     }
1507
1508     return TRUE;
1509 }
1510
1511
1512 STATIC void
1513 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1514 {
1515     /* The inversion list in the SSC is marked mortal; now we need a more
1516      * permanent copy, which is stored the same way that is done in a regular
1517      * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
1518      * map */
1519
1520     SV* invlist = invlist_clone(ssc->invlist);
1521
1522     PERL_ARGS_ASSERT_SSC_FINALIZE;
1523
1524     assert(is_ANYOF_SYNTHETIC(ssc));
1525
1526     /* The code in this file assumes that all but these flags aren't relevant
1527      * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
1528      * by the time we reach here */
1529     assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
1530
1531     populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1532
1533     set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1534                                 NULL, NULL, NULL, FALSE);
1535
1536     /* Make sure is clone-safe */
1537     ssc->invlist = NULL;
1538
1539     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1540         ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
1541     }
1542
1543     assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1544 }
1545
1546 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1547 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1548 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1549 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list         \
1550                                ? (TRIE_LIST_CUR( idx ) - 1)           \
1551                                : 0 )
1552
1553
1554 #ifdef DEBUGGING
1555 /*
1556    dump_trie(trie,widecharmap,revcharmap)
1557    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1558    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1559
1560    These routines dump out a trie in a somewhat readable format.
1561    The _interim_ variants are used for debugging the interim
1562    tables that are used to generate the final compressed
1563    representation which is what dump_trie expects.
1564
1565    Part of the reason for their existence is to provide a form
1566    of documentation as to how the different representations function.
1567
1568 */
1569
1570 /*
1571   Dumps the final compressed table form of the trie to Perl_debug_log.
1572   Used for debugging make_trie().
1573 */
1574
1575 STATIC void
1576 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1577             AV *revcharmap, U32 depth)
1578 {
1579     U32 state;
1580     SV *sv=sv_newmortal();
1581     int colwidth= widecharmap ? 6 : 4;
1582     U16 word;
1583     GET_RE_DEBUG_FLAGS_DECL;
1584
1585     PERL_ARGS_ASSERT_DUMP_TRIE;
1586
1587     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1588         (int)depth * 2 + 2,"",
1589         "Match","Base","Ofs" );
1590
1591     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1592         SV ** const tmp = av_fetch( revcharmap, state, 0);
1593         if ( tmp ) {
1594             PerlIO_printf( Perl_debug_log, "%*s",
1595                 colwidth,
1596                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1597                             PL_colors[0], PL_colors[1],
1598                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1599                             PERL_PV_ESCAPE_FIRSTCHAR
1600                 )
1601             );
1602         }
1603     }
1604     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1605         (int)depth * 2 + 2,"");
1606
1607     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1608         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1609     PerlIO_printf( Perl_debug_log, "\n");
1610
1611     for( state = 1 ; state < trie->statecount ; state++ ) {
1612         const U32 base = trie->states[ state ].trans.base;
1613
1614         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1615                                        (int)depth * 2 + 2,"", (UV)state);
1616
1617         if ( trie->states[ state ].wordnum ) {
1618             PerlIO_printf( Perl_debug_log, " W%4X",
1619                                            trie->states[ state ].wordnum );
1620         } else {
1621             PerlIO_printf( Perl_debug_log, "%6s", "" );
1622         }
1623
1624         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1625
1626         if ( base ) {
1627             U32 ofs = 0;
1628
1629             while( ( base + ofs  < trie->uniquecharcount ) ||
1630                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1631                      && trie->trans[ base + ofs - trie->uniquecharcount ].check
1632                                                                     != state))
1633                     ofs++;
1634
1635             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1636
1637             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1638                 if ( ( base + ofs >= trie->uniquecharcount )
1639                         && ( base + ofs - trie->uniquecharcount
1640                                                         < trie->lasttrans )
1641                         && trie->trans[ base + ofs
1642                                     - trie->uniquecharcount ].check == state )
1643                 {
1644                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1645                     colwidth,
1646                     (UV)trie->trans[ base + ofs
1647                                              - trie->uniquecharcount ].next );
1648                 } else {
1649                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1650                 }
1651             }
1652
1653             PerlIO_printf( Perl_debug_log, "]");
1654
1655         }
1656         PerlIO_printf( Perl_debug_log, "\n" );
1657     }
1658     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1659                                 (int)depth*2, "");
1660     for (word=1; word <= trie->wordcount; word++) {
1661         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1662             (int)word, (int)(trie->wordinfo[word].prev),
1663             (int)(trie->wordinfo[word].len));
1664     }
1665     PerlIO_printf(Perl_debug_log, "\n" );
1666 }
1667 /*
1668   Dumps a fully constructed but uncompressed trie in list form.
1669   List tries normally only are used for construction when the number of
1670   possible chars (trie->uniquecharcount) is very high.
1671   Used for debugging make_trie().
1672 */
1673 STATIC void
1674 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1675                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1676                          U32 depth)
1677 {
1678     U32 state;
1679     SV *sv=sv_newmortal();
1680     int colwidth= widecharmap ? 6 : 4;
1681     GET_RE_DEBUG_FLAGS_DECL;
1682
1683     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1684
1685     /* print out the table precompression.  */
1686     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1687         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1688         "------:-----+-----------------\n" );
1689
1690     for( state=1 ; state < next_alloc ; state ++ ) {
1691         U16 charid;
1692
1693         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1694             (int)depth * 2 + 2,"", (UV)state  );
1695         if ( ! trie->states[ state ].wordnum ) {
1696             PerlIO_printf( Perl_debug_log, "%5s| ","");
1697         } else {
1698             PerlIO_printf( Perl_debug_log, "W%4x| ",
1699                 trie->states[ state ].wordnum
1700             );
1701         }
1702         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1703             SV ** const tmp = av_fetch( revcharmap,
1704                                         TRIE_LIST_ITEM(state,charid).forid, 0);
1705             if ( tmp ) {
1706                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1707                     colwidth,
1708                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1709                               colwidth,
1710                               PL_colors[0], PL_colors[1],
1711                               (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1712                               | PERL_PV_ESCAPE_FIRSTCHAR
1713                     ) ,
1714                     TRIE_LIST_ITEM(state,charid).forid,
1715                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1716                 );
1717                 if (!(charid % 10))
1718                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1719                         (int)((depth * 2) + 14), "");
1720             }
1721         }
1722         PerlIO_printf( Perl_debug_log, "\n");
1723     }
1724 }
1725
1726 /*
1727   Dumps a fully constructed but uncompressed trie in table form.
1728   This is the normal DFA style state transition table, with a few
1729   twists to facilitate compression later.
1730   Used for debugging make_trie().
1731 */
1732 STATIC void
1733 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1734                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1735                           U32 depth)
1736 {
1737     U32 state;
1738     U16 charid;
1739     SV *sv=sv_newmortal();
1740     int colwidth= widecharmap ? 6 : 4;
1741     GET_RE_DEBUG_FLAGS_DECL;
1742
1743     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1744
1745     /*
1746        print out the table precompression so that we can do a visual check
1747        that they are identical.
1748      */
1749
1750     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1751
1752     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1753         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1754         if ( tmp ) {
1755             PerlIO_printf( Perl_debug_log, "%*s",
1756                 colwidth,
1757                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1758                             PL_colors[0], PL_colors[1],
1759                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1760                             PERL_PV_ESCAPE_FIRSTCHAR
1761                 )
1762             );
1763         }
1764     }
1765
1766     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1767
1768     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1769         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1770     }
1771
1772     PerlIO_printf( Perl_debug_log, "\n" );
1773
1774     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1775
1776         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1777             (int)depth * 2 + 2,"",
1778             (UV)TRIE_NODENUM( state ) );
1779
1780         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1781             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1782             if (v)
1783                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1784             else
1785                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1786         }
1787         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1788             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1789                                             (UV)trie->trans[ state ].check );
1790         } else {
1791             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1792                                             (UV)trie->trans[ state ].check,
1793             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1794         }
1795     }
1796 }
1797
1798 #endif
1799
1800
1801 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1802   startbranch: the first branch in the whole branch sequence
1803   first      : start branch of sequence of branch-exact nodes.
1804                May be the same as startbranch
1805   last       : Thing following the last branch.
1806                May be the same as tail.
1807   tail       : item following the branch sequence
1808   count      : words in the sequence
1809   flags      : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS)/
1810   depth      : indent depth
1811
1812 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1813
1814 A trie is an N'ary tree where the branches are determined by digital
1815 decomposition of the key. IE, at the root node you look up the 1st character and
1816 follow that branch repeat until you find the end of the branches. Nodes can be
1817 marked as "accepting" meaning they represent a complete word. Eg:
1818
1819   /he|she|his|hers/
1820
1821 would convert into the following structure. Numbers represent states, letters
1822 following numbers represent valid transitions on the letter from that state, if
1823 the number is in square brackets it represents an accepting state, otherwise it
1824 will be in parenthesis.
1825
1826       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1827       |    |
1828       |   (2)
1829       |    |
1830      (1)   +-i->(6)-+-s->[7]
1831       |
1832       +-s->(3)-+-h->(4)-+-e->[5]
1833
1834       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1835
1836 This shows that when matching against the string 'hers' we will begin at state 1
1837 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1838 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1839 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1840 single traverse. We store a mapping from accepting to state to which word was
1841 matched, and then when we have multiple possibilities we try to complete the
1842 rest of the regex in the order in which they occured in the alternation.
1843
1844 The only prior NFA like behaviour that would be changed by the TRIE support is
1845 the silent ignoring of duplicate alternations which are of the form:
1846
1847  / (DUPE|DUPE) X? (?{ ... }) Y /x
1848
1849 Thus EVAL blocks following a trie may be called a different number of times with
1850 and without the optimisation. With the optimisations dupes will be silently
1851 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1852 the following demonstrates:
1853
1854  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1855
1856 which prints out 'word' three times, but
1857
1858  'words'=~/(word|word|word)(?{ print $1 })S/
1859
1860 which doesnt print it out at all. This is due to other optimisations kicking in.
1861
1862 Example of what happens on a structural level:
1863
1864 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1865
1866    1: CURLYM[1] {1,32767}(18)
1867    5:   BRANCH(8)
1868    6:     EXACT <ac>(16)
1869    8:   BRANCH(11)
1870    9:     EXACT <ad>(16)
1871   11:   BRANCH(14)
1872   12:     EXACT <ab>(16)
1873   16:   SUCCEED(0)
1874   17:   NOTHING(18)
1875   18: END(0)
1876
1877 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1878 and should turn into:
1879
1880    1: CURLYM[1] {1,32767}(18)
1881    5:   TRIE(16)
1882         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1883           <ac>
1884           <ad>
1885           <ab>
1886   16:   SUCCEED(0)
1887   17:   NOTHING(18)
1888   18: END(0)
1889
1890 Cases where tail != last would be like /(?foo|bar)baz/:
1891
1892    1: BRANCH(4)
1893    2:   EXACT <foo>(8)
1894    4: BRANCH(7)
1895    5:   EXACT <bar>(8)
1896    7: TAIL(8)
1897    8: EXACT <baz>(10)
1898   10: END(0)
1899
1900 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1901 and would end up looking like:
1902
1903     1: TRIE(8)
1904       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1905         <foo>
1906         <bar>
1907    7: TAIL(8)
1908    8: EXACT <baz>(10)
1909   10: END(0)
1910
1911     d = uvchr_to_utf8_flags(d, uv, 0);
1912
1913 is the recommended Unicode-aware way of saying
1914
1915     *(d++) = uv;
1916 */
1917
1918 #define TRIE_STORE_REVCHAR(val)                                            \
1919     STMT_START {                                                           \
1920         if (UTF) {                                                         \
1921             SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1922             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1923             unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1924             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1925             SvPOK_on(zlopp);                                               \
1926             SvUTF8_on(zlopp);                                              \
1927             av_push(revcharmap, zlopp);                                    \
1928         } else {                                                           \
1929             char ooooff = (char)val;                                           \
1930             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1931         }                                                                  \
1932         } STMT_END
1933
1934 /* This gets the next character from the input, folding it if not already
1935  * folded. */
1936 #define TRIE_READ_CHAR STMT_START {                                           \
1937     wordlen++;                                                                \
1938     if ( UTF ) {                                                              \
1939         /* if it is UTF then it is either already folded, or does not need    \
1940          * folding */                                                         \
1941         uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
1942     }                                                                         \
1943     else if (folder == PL_fold_latin1) {                                      \
1944         /* This folder implies Unicode rules, which in the range expressible  \
1945          *  by not UTF is the lower case, with the two exceptions, one of     \
1946          *  which should have been taken care of before calling this */       \
1947         assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
1948         uvc = toLOWER_L1(*uc);                                                \
1949         if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
1950         len = 1;                                                              \
1951     } else {                                                                  \
1952         /* raw data, will be folded later if needed */                        \
1953         uvc = (U32)*uc;                                                       \
1954         len = 1;                                                              \
1955     }                                                                         \
1956 } STMT_END
1957
1958
1959
1960 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1961     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1962         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1963         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1964     }                                                           \
1965     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1966     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1967     TRIE_LIST_CUR( state )++;                                   \
1968 } STMT_END
1969
1970 #define TRIE_LIST_NEW(state) STMT_START {                       \
1971     Newxz( trie->states[ state ].trans.list,               \
1972         4, reg_trie_trans_le );                                 \
1973      TRIE_LIST_CUR( state ) = 1;                                \
1974      TRIE_LIST_LEN( state ) = 4;                                \
1975 } STMT_END
1976
1977 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1978     U16 dupe= trie->states[ state ].wordnum;                    \
1979     regnode * const noper_next = regnext( noper );              \
1980                                                                 \
1981     DEBUG_r({                                                   \
1982         /* store the word for dumping */                        \
1983         SV* tmp;                                                \
1984         if (OP(noper) != NOTHING)                               \
1985             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1986         else                                                    \
1987             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1988         av_push( trie_words, tmp );                             \
1989     });                                                         \
1990                                                                 \
1991     curword++;                                                  \
1992     trie->wordinfo[curword].prev   = 0;                         \
1993     trie->wordinfo[curword].len    = wordlen;                   \
1994     trie->wordinfo[curword].accept = state;                     \
1995                                                                 \
1996     if ( noper_next < tail ) {                                  \
1997         if (!trie->jump)                                        \
1998             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
1999                                                  sizeof(U16) ); \
2000         trie->jump[curword] = (U16)(noper_next - convert);      \
2001         if (!jumper)                                            \
2002             jumper = noper_next;                                \
2003         if (!nextbranch)                                        \
2004             nextbranch= regnext(cur);                           \
2005     }                                                           \
2006                                                                 \
2007     if ( dupe ) {                                               \
2008         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
2009         /* chain, so that when the bits of chain are later    */\
2010         /* linked together, the dups appear in the chain      */\
2011         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2012         trie->wordinfo[dupe].prev = curword;                    \
2013     } else {                                                    \
2014         /* we haven't inserted this word yet.                */ \
2015         trie->states[ state ].wordnum = curword;                \
2016     }                                                           \
2017 } STMT_END
2018
2019
2020 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
2021      ( ( base + charid >=  ucharcount                                   \
2022          && base + charid < ubound                                      \
2023          && state == trie->trans[ base - ucharcount + charid ].check    \
2024          && trie->trans[ base - ucharcount + charid ].next )            \
2025            ? trie->trans[ base - ucharcount + charid ].next             \
2026            : ( state==1 ? special : 0 )                                 \
2027       )
2028
2029 #define MADE_TRIE       1
2030 #define MADE_JUMP_TRIE  2
2031 #define MADE_EXACT_TRIE 4
2032
2033 STATIC I32
2034 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2035                   regnode *first, regnode *last, regnode *tail,
2036                   U32 word_count, U32 flags, U32 depth)
2037 {
2038     /* first pass, loop through and scan words */
2039     reg_trie_data *trie;
2040     HV *widecharmap = NULL;
2041     AV *revcharmap = newAV();
2042     regnode *cur;
2043     STRLEN len = 0;
2044     UV uvc = 0;
2045     U16 curword = 0;
2046     U32 next_alloc = 0;
2047     regnode *jumper = NULL;
2048     regnode *nextbranch = NULL;
2049     regnode *convert = NULL;
2050     U32 *prev_states; /* temp array mapping each state to previous one */
2051     /* we just use folder as a flag in utf8 */
2052     const U8 * folder = NULL;
2053
2054 #ifdef DEBUGGING
2055     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
2056     AV *trie_words = NULL;
2057     /* along with revcharmap, this only used during construction but both are
2058      * useful during debugging so we store them in the struct when debugging.
2059      */
2060 #else
2061     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2062     STRLEN trie_charcount=0;
2063 #endif
2064     SV *re_trie_maxbuff;
2065     GET_RE_DEBUG_FLAGS_DECL;
2066
2067     PERL_ARGS_ASSERT_MAKE_TRIE;
2068 #ifndef DEBUGGING
2069     PERL_UNUSED_ARG(depth);
2070 #endif
2071
2072     switch (flags) {
2073         case EXACT: break;
2074         case EXACTFA:
2075         case EXACTFU_SS:
2076         case EXACTFU: folder = PL_fold_latin1; break;
2077         case EXACTF:  folder = PL_fold; break;
2078         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2079     }
2080
2081     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2082     trie->refcount = 1;
2083     trie->startstate = 1;
2084     trie->wordcount = word_count;
2085     RExC_rxi->data->data[ data_slot ] = (void*)trie;
2086     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2087     if (flags == EXACT)
2088         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2089     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2090                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
2091
2092     DEBUG_r({
2093         trie_words = newAV();
2094     });
2095
2096     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2097     assert(re_trie_maxbuff);
2098     if (!SvIOK(re_trie_maxbuff)) {
2099         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2100     }
2101     DEBUG_TRIE_COMPILE_r({
2102         PerlIO_printf( Perl_debug_log,
2103           "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2104           (int)depth * 2 + 2, "",
2105           REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2106           REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2107     });
2108
2109    /* Find the node we are going to overwrite */
2110     if ( first == startbranch && OP( last ) != BRANCH ) {
2111         /* whole branch chain */
2112         convert = first;
2113     } else {
2114         /* branch sub-chain */
2115         convert = NEXTOPER( first );
2116     }
2117
2118     /*  -- First loop and Setup --
2119
2120        We first traverse the branches and scan each word to determine if it
2121        contains widechars, and how many unique chars there are, this is
2122        important as we have to build a table with at least as many columns as we
2123        have unique chars.
2124
2125        We use an array of integers to represent the character codes 0..255
2126        (trie->charmap) and we use a an HV* to store Unicode characters. We use
2127        the native representation of the character value as the key and IV's for
2128        the coded index.
2129
2130        *TODO* If we keep track of how many times each character is used we can
2131        remap the columns so that the table compression later on is more
2132        efficient in terms of memory by ensuring the most common value is in the
2133        middle and the least common are on the outside.  IMO this would be better
2134        than a most to least common mapping as theres a decent chance the most
2135        common letter will share a node with the least common, meaning the node
2136        will not be compressible. With a middle is most common approach the worst
2137        case is when we have the least common nodes twice.
2138
2139      */
2140
2141     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2142         regnode *noper = NEXTOPER( cur );
2143         const U8 *uc = (U8*)STRING( noper );
2144         const U8 *e  = uc + STR_LEN( noper );
2145         int foldlen = 0;
2146         U32 wordlen      = 0;         /* required init */
2147         STRLEN minchars = 0;
2148         STRLEN maxchars = 0;
2149         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2150                                                bitmap?*/
2151
2152         if (OP(noper) == NOTHING) {
2153             regnode *noper_next= regnext(noper);
2154             if (noper_next != tail && OP(noper_next) == flags) {
2155                 noper = noper_next;
2156                 uc= (U8*)STRING(noper);
2157                 e= uc + STR_LEN(noper);
2158                 trie->minlen= STR_LEN(noper);
2159             } else {
2160                 trie->minlen= 0;
2161                 continue;
2162             }
2163         }
2164
2165         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2166             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2167                                           regardless of encoding */
2168             if (OP( noper ) == EXACTFU_SS) {
2169                 /* false positives are ok, so just set this */
2170                 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2171             }
2172         }
2173         for ( ; uc < e ; uc += len ) {  /* Look at each char in the current
2174                                            branch */
2175             TRIE_CHARCOUNT(trie)++;
2176             TRIE_READ_CHAR;
2177
2178             /* TRIE_READ_CHAR returns the current character, or its fold if /i
2179              * is in effect.  Under /i, this character can match itself, or
2180              * anything that folds to it.  If not under /i, it can match just
2181              * itself.  Most folds are 1-1, for example k, K, and KELVIN SIGN
2182              * all fold to k, and all are single characters.   But some folds
2183              * expand to more than one character, so for example LATIN SMALL
2184              * LIGATURE FFI folds to the three character sequence 'ffi'.  If
2185              * the string beginning at 'uc' is 'ffi', it could be matched by
2186              * three characters, or just by the one ligature character. (It
2187              * could also be matched by two characters: LATIN SMALL LIGATURE FF
2188              * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2189              * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2190              * match.)  The trie needs to know the minimum and maximum number
2191              * of characters that could match so that it can use size alone to
2192              * quickly reject many match attempts.  The max is simple: it is
2193              * the number of folded characters in this branch (since a fold is
2194              * never shorter than what folds to it. */
2195
2196             maxchars++;
2197
2198             /* And the min is equal to the max if not under /i (indicated by
2199              * 'folder' being NULL), or there are no multi-character folds.  If
2200              * there is a multi-character fold, the min is incremented just
2201              * once, for the character that folds to the sequence.  Each
2202              * character in the sequence needs to be added to the list below of
2203              * characters in the trie, but we count only the first towards the
2204              * min number of characters needed.  This is done through the
2205              * variable 'foldlen', which is returned by the macros that look
2206              * for these sequences as the number of bytes the sequence
2207              * occupies.  Each time through the loop, we decrement 'foldlen' by
2208              * how many bytes the current char occupies.  Only when it reaches
2209              * 0 do we increment 'minchars' or look for another multi-character
2210              * sequence. */
2211             if (folder == NULL) {
2212                 minchars++;
2213             }
2214             else if (foldlen > 0) {
2215                 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2216             }
2217             else {
2218                 minchars++;
2219
2220                 /* See if *uc is the beginning of a multi-character fold.  If
2221                  * so, we decrement the length remaining to look at, to account
2222                  * for the current character this iteration.  (We can use 'uc'
2223                  * instead of the fold returned by TRIE_READ_CHAR because for
2224                  * non-UTF, the latin1_safe macro is smart enough to account
2225                  * for all the unfolded characters, and because for UTF, the
2226                  * string will already have been folded earlier in the
2227                  * compilation process */
2228                 if (UTF) {
2229                     if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2230                         foldlen -= UTF8SKIP(uc);
2231                     }
2232                 }
2233                 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2234                     foldlen--;
2235                 }
2236             }
2237
2238             /* The current character (and any potential folds) should be added
2239              * to the possible matching characters for this position in this
2240              * branch */
2241             if ( uvc < 256 ) {
2242                 if ( folder ) {
2243                     U8 folded= folder[ (U8) uvc ];
2244                     if ( !trie->charmap[ folded ] ) {
2245                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
2246                         TRIE_STORE_REVCHAR( folded );
2247                     }
2248                 }
2249                 if ( !trie->charmap[ uvc ] ) {
2250                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2251                     TRIE_STORE_REVCHAR( uvc );
2252                 }
2253                 if ( set_bit ) {
2254                     /* store the codepoint in the bitmap, and its folded
2255                      * equivalent. */
2256                     TRIE_BITMAP_SET(trie, uvc);
2257
2258                     /* store the folded codepoint */
2259                     if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2260
2261                     if ( !UTF ) {
2262                         /* store first byte of utf8 representation of
2263                            variant codepoints */
2264                         if (! UVCHR_IS_INVARIANT(uvc)) {
2265                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2266                         }
2267                     }
2268                     set_bit = 0; /* We've done our bit :-) */
2269                 }
2270             } else {
2271
2272                 /* XXX We could come up with the list of code points that fold
2273                  * to this using PL_utf8_foldclosures, except not for
2274                  * multi-char folds, as there may be multiple combinations
2275                  * there that could work, which needs to wait until runtime to
2276                  * resolve (The comment about LIGATURE FFI above is such an
2277                  * example */
2278
2279                 SV** svpp;
2280                 if ( !widecharmap )
2281                     widecharmap = newHV();
2282
2283                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2284
2285                 if ( !svpp )
2286                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2287
2288                 if ( !SvTRUE( *svpp ) ) {
2289                     sv_setiv( *svpp, ++trie->uniquecharcount );
2290                     TRIE_STORE_REVCHAR(uvc);
2291                 }
2292             }
2293         } /* end loop through characters in this branch of the trie */
2294
2295         /* We take the min and max for this branch and combine to find the min
2296          * and max for all branches processed so far */
2297         if( cur == first ) {
2298             trie->minlen = minchars;
2299             trie->maxlen = maxchars;
2300         } else if (minchars < trie->minlen) {
2301             trie->minlen = minchars;
2302         } else if (maxchars > trie->maxlen) {
2303             trie->maxlen = maxchars;
2304         }
2305     } /* end first pass */
2306     DEBUG_TRIE_COMPILE_r(
2307         PerlIO_printf( Perl_debug_log,
2308                 "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2309                 (int)depth * 2 + 2,"",
2310                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2311                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2312                 (int)trie->minlen, (int)trie->maxlen )
2313     );
2314
2315     /*
2316         We now know what we are dealing with in terms of unique chars and
2317         string sizes so we can calculate how much memory a naive
2318         representation using a flat table  will take. If it's over a reasonable
2319         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2320         conservative but potentially much slower representation using an array
2321         of lists.
2322
2323         At the end we convert both representations into the same compressed
2324         form that will be used in regexec.c for matching with. The latter
2325         is a form that cannot be used to construct with but has memory
2326         properties similar to the list form and access properties similar
2327         to the table form making it both suitable for fast searches and
2328         small enough that its feasable to store for the duration of a program.
2329
2330         See the comment in the code where the compressed table is produced
2331         inplace from the flat tabe representation for an explanation of how
2332         the compression works.
2333
2334     */
2335
2336
2337     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2338     prev_states[1] = 0;
2339
2340     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2341                                                     > SvIV(re_trie_maxbuff) )
2342     {
2343         /*
2344             Second Pass -- Array Of Lists Representation
2345
2346             Each state will be represented by a list of charid:state records
2347             (reg_trie_trans_le) the first such element holds the CUR and LEN
2348             points of the allocated array. (See defines above).
2349
2350             We build the initial structure using the lists, and then convert
2351             it into the compressed table form which allows faster lookups
2352             (but cant be modified once converted).
2353         */
2354
2355         STRLEN transcount = 1;
2356
2357         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2358             "%*sCompiling trie using list compiler\n",
2359             (int)depth * 2 + 2, ""));
2360
2361         trie->states = (reg_trie_state *)
2362             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2363                                   sizeof(reg_trie_state) );
2364         TRIE_LIST_NEW(1);
2365         next_alloc = 2;
2366
2367         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2368
2369             regnode *noper   = NEXTOPER( cur );
2370             U8 *uc           = (U8*)STRING( noper );
2371             const U8 *e      = uc + STR_LEN( noper );
2372             U32 state        = 1;         /* required init */
2373             U16 charid       = 0;         /* sanity init */
2374             U32 wordlen      = 0;         /* required init */
2375
2376             if (OP(noper) == NOTHING) {
2377                 regnode *noper_next= regnext(noper);
2378                 if (noper_next != tail && OP(noper_next) == flags) {
2379                     noper = noper_next;
2380                     uc= (U8*)STRING(noper);
2381                     e= uc + STR_LEN(noper);
2382                 }
2383             }
2384
2385             if (OP(noper) != NOTHING) {
2386                 for ( ; uc < e ; uc += len ) {
2387
2388                     TRIE_READ_CHAR;
2389
2390                     if ( uvc < 256 ) {
2391                         charid = trie->charmap[ uvc ];
2392                     } else {
2393                         SV** const svpp = hv_fetch( widecharmap,
2394                                                     (char*)&uvc,
2395                                                     sizeof( UV ),
2396                                                     0);
2397                         if ( !svpp ) {
2398                             charid = 0;
2399                         } else {
2400                             charid=(U16)SvIV( *svpp );
2401                         }
2402                     }
2403                     /* charid is now 0 if we dont know the char read, or
2404                      * nonzero if we do */
2405                     if ( charid ) {
2406
2407                         U16 check;
2408                         U32 newstate = 0;
2409
2410                         charid--;
2411                         if ( !trie->states[ state ].trans.list ) {
2412                             TRIE_LIST_NEW( state );
2413                         }
2414                         for ( check = 1;
2415                               check <= TRIE_LIST_USED( state );
2416                               check++ )
2417                         {
2418                             if ( TRIE_LIST_ITEM( state, check ).forid
2419                                                                     == charid )
2420                             {
2421                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2422                                 break;
2423                             }
2424                         }
2425                         if ( ! newstate ) {
2426                             newstate = next_alloc++;
2427                             prev_states[newstate] = state;
2428                             TRIE_LIST_PUSH( state, charid, newstate );
2429                             transcount++;
2430                         }
2431                         state = newstate;
2432                     } else {
2433                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2434                     }
2435                 }
2436             }
2437             TRIE_HANDLE_WORD(state);
2438
2439         } /* end second pass */
2440
2441         /* next alloc is the NEXT state to be allocated */
2442         trie->statecount = next_alloc;
2443         trie->states = (reg_trie_state *)
2444             PerlMemShared_realloc( trie->states,
2445                                    next_alloc
2446                                    * sizeof(reg_trie_state) );
2447
2448         /* and now dump it out before we compress it */
2449         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2450                                                          revcharmap, next_alloc,
2451                                                          depth+1)
2452         );
2453
2454         trie->trans = (reg_trie_trans *)
2455             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2456         {
2457             U32 state;
2458             U32 tp = 0;
2459             U32 zp = 0;
2460
2461
2462             for( state=1 ; state < next_alloc ; state ++ ) {
2463                 U32 base=0;
2464
2465                 /*
2466                 DEBUG_TRIE_COMPILE_MORE_r(
2467                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2468                 );
2469                 */
2470
2471                 if (trie->states[state].trans.list) {
2472                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2473                     U16 maxid=minid;
2474                     U16 idx;
2475
2476                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2477                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2478                         if ( forid < minid ) {
2479                             minid=forid;
2480                         } else if ( forid > maxid ) {
2481                             maxid=forid;
2482                         }
2483                     }
2484                     if ( transcount < tp + maxid - minid + 1) {
2485                         transcount *= 2;
2486                         trie->trans = (reg_trie_trans *)
2487                             PerlMemShared_realloc( trie->trans,
2488                                                      transcount
2489                                                      * sizeof(reg_trie_trans) );
2490                         Zero( trie->trans + (transcount / 2),
2491                               transcount / 2,
2492                               reg_trie_trans );
2493                     }
2494                     base = trie->uniquecharcount + tp - minid;
2495                     if ( maxid == minid ) {
2496                         U32 set = 0;
2497                         for ( ; zp < tp ; zp++ ) {
2498                             if ( ! trie->trans[ zp ].next ) {
2499                                 base = trie->uniquecharcount + zp - minid;
2500                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2501                                                                    1).newstate;
2502                                 trie->trans[ zp ].check = state;
2503                                 set = 1;
2504                                 break;
2505                             }
2506                         }
2507                         if ( !set ) {
2508                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2509                                                                    1).newstate;
2510                             trie->trans[ tp ].check = state;
2511                             tp++;
2512                             zp = tp;
2513                         }
2514                     } else {
2515                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2516                             const U32 tid = base
2517                                            - trie->uniquecharcount
2518                                            + TRIE_LIST_ITEM( state, idx ).forid;
2519                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2520                                                                 idx ).newstate;
2521                             trie->trans[ tid ].check = state;
2522                         }
2523                         tp += ( maxid - minid + 1 );
2524                     }
2525                     Safefree(trie->states[ state ].trans.list);
2526                 }
2527                 /*
2528                 DEBUG_TRIE_COMPILE_MORE_r(
2529                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2530                 );
2531                 */
2532                 trie->states[ state ].trans.base=base;
2533             }
2534             trie->lasttrans = tp + 1;
2535         }
2536     } else {
2537         /*
2538            Second Pass -- Flat Table Representation.
2539
2540            we dont use the 0 slot of either trans[] or states[] so we add 1 to
2541            each.  We know that we will need Charcount+1 trans at most to store
2542            the data (one row per char at worst case) So we preallocate both
2543            structures assuming worst case.
2544
2545            We then construct the trie using only the .next slots of the entry
2546            structs.
2547
2548            We use the .check field of the first entry of the node temporarily
2549            to make compression both faster and easier by keeping track of how
2550            many non zero fields are in the node.
2551
2552            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2553            transition.
2554
2555            There are two terms at use here: state as a TRIE_NODEIDX() which is
2556            a number representing the first entry of the node, and state as a
2557            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2558            and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2559            if there are 2 entrys per node. eg:
2560
2561              A B       A B
2562           1. 2 4    1. 3 7
2563           2. 0 3    3. 0 5
2564           3. 0 0    5. 0 0
2565           4. 0 0    7. 0 0
2566
2567            The table is internally in the right hand, idx form. However as we
2568            also have to deal with the states array which is indexed by nodenum
2569            we have to use TRIE_NODENUM() to convert.
2570
2571         */
2572         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2573             "%*sCompiling trie using table compiler\n",
2574             (int)depth * 2 + 2, ""));
2575
2576         trie->trans = (reg_trie_trans *)
2577             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2578                                   * trie->uniquecharcount + 1,
2579                                   sizeof(reg_trie_trans) );
2580         trie->states = (reg_trie_state *)
2581             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2582                                   sizeof(reg_trie_state) );
2583         next_alloc = trie->uniquecharcount + 1;
2584
2585
2586         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2587
2588             regnode *noper   = NEXTOPER( cur );
2589             const U8 *uc     = (U8*)STRING( noper );
2590             const U8 *e      = uc + STR_LEN( noper );
2591
2592             U32 state        = 1;         /* required init */
2593
2594             U16 charid       = 0;         /* sanity init */
2595             U32 accept_state = 0;         /* sanity init */
2596
2597             U32 wordlen      = 0;         /* required init */
2598
2599             if (OP(noper) == NOTHING) {
2600                 regnode *noper_next= regnext(noper);
2601                 if (noper_next != tail && OP(noper_next) == flags) {
2602                     noper = noper_next;
2603                     uc= (U8*)STRING(noper);
2604                     e= uc + STR_LEN(noper);
2605                 }
2606             }
2607
2608             if ( OP(noper) != NOTHING ) {
2609                 for ( ; uc < e ; uc += len ) {
2610
2611                     TRIE_READ_CHAR;
2612
2613                     if ( uvc < 256 ) {
2614                         charid = trie->charmap[ uvc ];
2615                     } else {
2616                         SV* const * const svpp = hv_fetch( widecharmap,
2617                                                            (char*)&uvc,
2618                                                            sizeof( UV ),
2619                                                            0);
2620                         charid = svpp ? (U16)SvIV(*svpp) : 0;
2621                     }
2622                     if ( charid ) {
2623                         charid--;
2624                         if ( !trie->trans[ state + charid ].next ) {
2625                             trie->trans[ state + charid ].next = next_alloc;
2626                             trie->trans[ state ].check++;
2627                             prev_states[TRIE_NODENUM(next_alloc)]
2628                                     = TRIE_NODENUM(state);
2629                             next_alloc += trie->uniquecharcount;
2630                         }
2631                         state = trie->trans[ state + charid ].next;
2632                     } else {
2633                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2634                     }
2635                     /* charid is now 0 if we dont know the char read, or
2636                      * nonzero if we do */
2637                 }
2638             }
2639             accept_state = TRIE_NODENUM( state );
2640             TRIE_HANDLE_WORD(accept_state);
2641
2642         } /* end second pass */
2643
2644         /* and now dump it out before we compress it */
2645         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2646                                                           revcharmap,
2647                                                           next_alloc, depth+1));
2648
2649         {
2650         /*
2651            * Inplace compress the table.*
2652
2653            For sparse data sets the table constructed by the trie algorithm will
2654            be mostly 0/FAIL transitions or to put it another way mostly empty.
2655            (Note that leaf nodes will not contain any transitions.)
2656
2657            This algorithm compresses the tables by eliminating most such
2658            transitions, at the cost of a modest bit of extra work during lookup:
2659
2660            - Each states[] entry contains a .base field which indicates the
2661            index in the state[] array wheres its transition data is stored.
2662
2663            - If .base is 0 there are no valid transitions from that node.
2664
2665            - If .base is nonzero then charid is added to it to find an entry in
2666            the trans array.
2667
2668            -If trans[states[state].base+charid].check!=state then the
2669            transition is taken to be a 0/Fail transition. Thus if there are fail
2670            transitions at the front of the node then the .base offset will point
2671            somewhere inside the previous nodes data (or maybe even into a node
2672            even earlier), but the .check field determines if the transition is
2673            valid.
2674
2675            XXX - wrong maybe?
2676            The following process inplace converts the table to the compressed
2677            table: We first do not compress the root node 1,and mark all its
2678            .check pointers as 1 and set its .base pointer as 1 as well. This
2679            allows us to do a DFA construction from the compressed table later,
2680            and ensures that any .base pointers we calculate later are greater
2681            than 0.
2682
2683            - We set 'pos' to indicate the first entry of the second node.
2684
2685            - We then iterate over the columns of the node, finding the first and
2686            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2687            and set the .check pointers accordingly, and advance pos
2688            appropriately and repreat for the next node. Note that when we copy
2689            the next pointers we have to convert them from the original
2690            NODEIDX form to NODENUM form as the former is not valid post
2691            compression.
2692
2693            - If a node has no transitions used we mark its base as 0 and do not
2694            advance the pos pointer.
2695
2696            - If a node only has one transition we use a second pointer into the
2697            structure to fill in allocated fail transitions from other states.
2698            This pointer is independent of the main pointer and scans forward
2699            looking for null transitions that are allocated to a state. When it
2700            finds one it writes the single transition into the "hole".  If the
2701            pointer doesnt find one the single transition is appended as normal.
2702
2703            - Once compressed we can Renew/realloc the structures to release the
2704            excess space.
2705
2706            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2707            specifically Fig 3.47 and the associated pseudocode.
2708
2709            demq
2710         */
2711         const U32 laststate = TRIE_NODENUM( next_alloc );
2712         U32 state, charid;
2713         U32 pos = 0, zp=0;
2714         trie->statecount = laststate;
2715
2716         for ( state = 1 ; state < laststate ; state++ ) {
2717             U8 flag = 0;
2718             const U32 stateidx = TRIE_NODEIDX( state );
2719             const U32 o_used = trie->trans[ stateidx ].check;
2720             U32 used = trie->trans[ stateidx ].check;
2721             trie->trans[ stateidx ].check = 0;
2722
2723             for ( charid = 0;
2724                   used && charid < trie->uniquecharcount;
2725                   charid++ )
2726             {
2727                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2728                     if ( trie->trans[ stateidx + charid ].next ) {
2729                         if (o_used == 1) {
2730                             for ( ; zp < pos ; zp++ ) {
2731                                 if ( ! trie->trans[ zp ].next ) {
2732                                     break;
2733                                 }
2734                             }
2735                             trie->states[ state ].trans.base
2736                                                     = zp
2737                                                       + trie->uniquecharcount
2738                                                       - charid ;
2739                             trie->trans[ zp ].next
2740                                 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2741                                                              + charid ].next );
2742                             trie->trans[ zp ].check = state;
2743                             if ( ++zp > pos ) pos = zp;
2744                             break;
2745                         }
2746                         used--;
2747                     }
2748                     if ( !flag ) {
2749                         flag = 1;
2750                         trie->states[ state ].trans.base
2751                                        = pos + trie->uniquecharcount - charid ;
2752                     }
2753                     trie->trans[ pos ].next
2754                         = SAFE_TRIE_NODENUM(
2755                                        trie->trans[ stateidx + charid ].next );
2756                     trie->trans[ pos ].check = state;
2757                     pos++;
2758                 }
2759             }
2760         }
2761         trie->lasttrans = pos + 1;
2762         trie->states = (reg_trie_state *)
2763             PerlMemShared_realloc( trie->states, laststate
2764                                    * sizeof(reg_trie_state) );
2765         DEBUG_TRIE_COMPILE_MORE_r(
2766             PerlIO_printf( Perl_debug_log,
2767                 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2768                 (int)depth * 2 + 2,"",
2769                 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2770                        + 1 ),
2771                 (IV)next_alloc,
2772                 (IV)pos,
2773                 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2774             );
2775
2776         } /* end table compress */
2777     }
2778     DEBUG_TRIE_COMPILE_MORE_r(
2779             PerlIO_printf(Perl_debug_log,
2780                 "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2781                 (int)depth * 2 + 2, "",
2782                 (UV)trie->statecount,
2783                 (UV)trie->lasttrans)
2784     );
2785     /* resize the trans array to remove unused space */
2786     trie->trans = (reg_trie_trans *)
2787         PerlMemShared_realloc( trie->trans, trie->lasttrans
2788                                * sizeof(reg_trie_trans) );
2789
2790     {   /* Modify the program and insert the new TRIE node */
2791         U8 nodetype =(U8)(flags & 0xFF);
2792         char *str=NULL;
2793
2794 #ifdef DEBUGGING
2795         regnode *optimize = NULL;
2796 #ifdef RE_TRACK_PATTERN_OFFSETS
2797
2798         U32 mjd_offset = 0;
2799         U32 mjd_nodelen = 0;
2800 #endif /* RE_TRACK_PATTERN_OFFSETS */
2801 #endif /* DEBUGGING */
2802         /*
2803            This means we convert either the first branch or the first Exact,
2804            depending on whether the thing following (in 'last') is a branch
2805            or not and whther first is the startbranch (ie is it a sub part of
2806            the alternation or is it the whole thing.)
2807            Assuming its a sub part we convert the EXACT otherwise we convert
2808            the whole branch sequence, including the first.
2809          */
2810         /* Find the node we are going to overwrite */
2811         if ( first != startbranch || OP( last ) == BRANCH ) {
2812             /* branch sub-chain */
2813             NEXT_OFF( first ) = (U16)(last - first);
2814 #ifdef RE_TRACK_PATTERN_OFFSETS
2815             DEBUG_r({
2816                 mjd_offset= Node_Offset((convert));
2817                 mjd_nodelen= Node_Length((convert));
2818             });
2819 #endif
2820             /* whole branch chain */
2821         }
2822 #ifdef RE_TRACK_PATTERN_OFFSETS
2823         else {
2824             DEBUG_r({
2825                 const  regnode *nop = NEXTOPER( convert );
2826                 mjd_offset= Node_Offset((nop));
2827                 mjd_nodelen= Node_Length((nop));
2828             });
2829         }
2830         DEBUG_OPTIMISE_r(
2831             PerlIO_printf(Perl_debug_log,
2832                 "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2833                 (int)depth * 2 + 2, "",
2834                 (UV)mjd_offset, (UV)mjd_nodelen)
2835         );
2836 #endif
2837         /* But first we check to see if there is a common prefix we can
2838            split out as an EXACT and put in front of the TRIE node.  */
2839         trie->startstate= 1;
2840         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2841             U32 state;
2842             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2843                 U32 ofs = 0;
2844                 I32 idx = -1;
2845                 U32 count = 0;
2846                 const U32 base = trie->states[ state ].trans.base;
2847
2848                 if ( trie->states[state].wordnum )
2849                         count = 1;
2850
2851                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2852                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2853                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2854                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2855                     {
2856                         if ( ++count > 1 ) {
2857                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2858                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2859                             if ( state == 1 ) break;
2860                             if ( count == 2 ) {
2861                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2862                                 DEBUG_OPTIMISE_r(
2863                                     PerlIO_printf(Perl_debug_log,
2864                                         "%*sNew Start State=%"UVuf" Class: [",
2865                                         (int)depth * 2 + 2, "",
2866                                         (UV)state));
2867                                 if (idx >= 0) {
2868                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2869                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2870
2871                                     TRIE_BITMAP_SET(trie,*ch);
2872                                     if ( folder )
2873                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2874                                     DEBUG_OPTIMISE_r(
2875                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2876                                     );
2877                                 }
2878                             }
2879                             TRIE_BITMAP_SET(trie,*ch);
2880                             if ( folder )
2881                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2882                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2883                         }
2884                         idx = ofs;
2885                     }
2886                 }
2887                 if ( count == 1 ) {
2888                     SV **tmp = av_fetch( revcharmap, idx, 0);
2889                     STRLEN len;
2890                     char *ch = SvPV( *tmp, len );
2891                     DEBUG_OPTIMISE_r({
2892                         SV *sv=sv_newmortal();
2893                         PerlIO_printf( Perl_debug_log,
2894                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2895                             (int)depth * 2 + 2, "",
2896                             (UV)state, (UV)idx,
2897                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2898                                 PL_colors[0], PL_colors[1],
2899                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2900                                 PERL_PV_ESCAPE_FIRSTCHAR
2901                             )
2902                         );
2903                     });
2904                     if ( state==1 ) {
2905                         OP( convert ) = nodetype;
2906                         str=STRING(convert);
2907                         STR_LEN(convert)=0;
2908                     }
2909                     STR_LEN(convert) += len;
2910                     while (len--)
2911                         *str++ = *ch++;
2912                 } else {
2913 #ifdef DEBUGGING
2914                     if (state>1)
2915                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2916 #endif
2917                     break;
2918                 }
2919             }
2920             trie->prefixlen = (state-1);
2921             if (str) {
2922                 regnode *n = convert+NODE_SZ_STR(convert);
2923                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2924                 trie->startstate = state;
2925                 trie->minlen -= (state - 1);
2926                 trie->maxlen -= (state - 1);
2927 #ifdef DEBUGGING
2928                /* At least the UNICOS C compiler choked on this
2929                 * being argument to DEBUG_r(), so let's just have
2930                 * it right here. */
2931                if (
2932 #ifdef PERL_EXT_RE_BUILD
2933                    1
2934 #else
2935                    DEBUG_r_TEST
2936 #endif
2937                    ) {
2938                    regnode *fix = convert;
2939                    U32 word = trie->wordcount;
2940                    mjd_nodelen++;
2941                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2942                    while( ++fix < n ) {
2943                        Set_Node_Offset_Length(fix, 0, 0);
2944                    }
2945                    while (word--) {
2946                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2947                        if (tmp) {
2948                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2949                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2950                            else
2951                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2952                        }
2953                    }
2954                }
2955 #endif
2956                 if (trie->maxlen) {
2957                     convert = n;
2958                 } else {
2959                     NEXT_OFF(convert) = (U16)(tail - convert);
2960                     DEBUG_r(optimize= n);
2961                 }
2962             }
2963         }
2964         if (!jumper)
2965             jumper = last;
2966         if ( trie->maxlen ) {
2967             NEXT_OFF( convert ) = (U16)(tail - convert);
2968             ARG_SET( convert, data_slot );
2969             /* Store the offset to the first unabsorbed branch in
2970                jump[0], which is otherwise unused by the jump logic.
2971                We use this when dumping a trie and during optimisation. */
2972             if (trie->jump)
2973                 trie->jump[0] = (U16)(nextbranch - convert);
2974
2975             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2976              *   and there is a bitmap
2977              *   and the first "jump target" node we found leaves enough room
2978              * then convert the TRIE node into a TRIEC node, with the bitmap
2979              * embedded inline in the opcode - this is hypothetically faster.
2980              */
2981             if ( !trie->states[trie->startstate].wordnum
2982                  && trie->bitmap
2983                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2984             {
2985                 OP( convert ) = TRIEC;
2986                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2987                 PerlMemShared_free(trie->bitmap);
2988                 trie->bitmap= NULL;
2989             } else
2990                 OP( convert ) = TRIE;
2991
2992             /* store the type in the flags */
2993             convert->flags = nodetype;
2994             DEBUG_r({
2995             optimize = convert
2996                       + NODE_STEP_REGNODE
2997                       + regarglen[ OP( convert ) ];
2998             });
2999             /* XXX We really should free up the resource in trie now,
3000                    as we won't use them - (which resources?) dmq */
3001         }
3002         /* needed for dumping*/
3003         DEBUG_r(if (optimize) {
3004             regnode *opt = convert;
3005
3006             while ( ++opt < optimize) {
3007                 Set_Node_Offset_Length(opt,0,0);
3008             }
3009             /*
3010                 Try to clean up some of the debris left after the
3011                 optimisation.
3012              */
3013             while( optimize < jumper ) {
3014                 mjd_nodelen += Node_Length((optimize));
3015                 OP( optimize ) = OPTIMIZED;
3016                 Set_Node_Offset_Length(optimize,0,0);
3017                 optimize++;
3018             }
3019             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
3020         });
3021     } /* end node insert */
3022
3023     /*  Finish populating the prev field of the wordinfo array.  Walk back
3024      *  from each accept state until we find another accept state, and if
3025      *  so, point the first word's .prev field at the second word. If the
3026      *  second already has a .prev field set, stop now. This will be the
3027      *  case either if we've already processed that word's accept state,
3028      *  or that state had multiple words, and the overspill words were
3029      *  already linked up earlier.
3030      */
3031     {
3032         U16 word;
3033         U32 state;
3034         U16 prev;
3035
3036         for (word=1; word <= trie->wordcount; word++) {
3037             prev = 0;
3038             if (trie->wordinfo[word].prev)
3039                 continue;
3040             state = trie->wordinfo[word].accept;
3041             while (state) {
3042                 state = prev_states[state];
3043                 if (!state)
3044                     break;
3045                 prev = trie->states[state].wordnum;
3046                 if (prev)
3047                     break;
3048             }
3049             trie->wordinfo[word].prev = prev;
3050         }
3051         Safefree(prev_states);
3052     }
3053
3054
3055     /* and now dump out the compressed format */
3056     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3057
3058     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3059 #ifdef DEBUGGING
3060     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3061     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3062 #else
3063     SvREFCNT_dec_NN(revcharmap);
3064 #endif
3065     return trie->jump
3066            ? MADE_JUMP_TRIE
3067            : trie->startstate>1
3068              ? MADE_EXACT_TRIE
3069              : MADE_TRIE;
3070 }
3071
3072 STATIC regnode *
3073 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3074 {
3075 /* The Trie is constructed and compressed now so we can build a fail array if
3076  * it's needed
3077
3078    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3079    3.32 in the
3080    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3081    Ullman 1985/88
3082    ISBN 0-201-10088-6
3083
3084    We find the fail state for each state in the trie, this state is the longest
3085    proper suffix of the current state's 'word' that is also a proper prefix of
3086    another word in our trie. State 1 represents the word '' and is thus the
3087    default fail state. This allows the DFA not to have to restart after its
3088    tried and failed a word at a given point, it simply continues as though it
3089    had been matching the other word in the first place.
3090    Consider
3091       'abcdgu'=~/abcdefg|cdgu/
3092    When we get to 'd' we are still matching the first word, we would encounter
3093    'g' which would fail, which would bring us to the state representing 'd' in
3094    the second word where we would try 'g' and succeed, proceeding to match
3095    'cdgu'.
3096  */
3097  /* add a fail transition */
3098     const U32 trie_offset = ARG(source);
3099     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3100     U32 *q;
3101     const U32 ucharcount = trie->uniquecharcount;
3102     const U32 numstates = trie->statecount;
3103     const U32 ubound = trie->lasttrans + ucharcount;
3104     U32 q_read = 0;
3105     U32 q_write = 0;
3106     U32 charid;
3107     U32 base = trie->states[ 1 ].trans.base;
3108     U32 *fail;
3109     reg_ac_data *aho;
3110     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3111     regnode *stclass;
3112     GET_RE_DEBUG_FLAGS_DECL;
3113
3114     PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3115     PERL_UNUSED_CONTEXT;
3116 #ifndef DEBUGGING
3117     PERL_UNUSED_ARG(depth);
3118 #endif
3119
3120     if ( OP(source) == TRIE ) {
3121         struct regnode_1 *op = (struct regnode_1 *)
3122             PerlMemShared_calloc(1, sizeof(struct regnode_1));
3123         StructCopy(source,op,struct regnode_1);
3124         stclass = (regnode *)op;
3125     } else {
3126         struct regnode_charclass *op = (struct regnode_charclass *)
3127             PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3128         StructCopy(source,op,struct regnode_charclass);
3129         stclass = (regnode *)op;
3130     }
3131     OP(stclass)+=2; /* covert the TRIE type to its AHO-CORASICK equivalent */
3132
3133     ARG_SET( stclass, data_slot );
3134     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3135     RExC_rxi->data->data[ data_slot ] = (void*)aho;
3136     aho->trie=trie_offset;
3137     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3138     Copy( trie->states, aho->states, numstates, reg_trie_state );
3139     Newxz( q, numstates, U32);
3140     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3141     aho->refcount = 1;
3142     fail = aho->fail;
3143     /* initialize fail[0..1] to be 1 so that we always have
3144        a valid final fail state */
3145     fail[ 0 ] = fail[ 1 ] = 1;
3146
3147     for ( charid = 0; charid < ucharcount ; charid++ ) {
3148         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3149         if ( newstate ) {
3150             q[ q_write ] = newstate;
3151             /* set to point at the root */
3152             fail[ q[ q_write++ ] ]=1;
3153         }
3154     }
3155     while ( q_read < q_write) {
3156         const U32 cur = q[ q_read++ % numstates ];
3157         base = trie->states[ cur ].trans.base;
3158
3159         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3160             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3161             if (ch_state) {
3162                 U32 fail_state = cur;
3163                 U32 fail_base;
3164                 do {
3165                     fail_state = fail[ fail_state ];
3166                     fail_base = aho->states[ fail_state ].trans.base;
3167                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3168
3169                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3170                 fail[ ch_state ] = fail_state;
3171                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3172                 {
3173                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
3174                 }
3175                 q[ q_write++ % numstates] = ch_state;
3176             }
3177         }
3178     }
3179     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3180        when we fail in state 1, this allows us to use the
3181        charclass scan to find a valid start char. This is based on the principle
3182        that theres a good chance the string being searched contains lots of stuff
3183        that cant be a start char.
3184      */
3185     fail[ 0 ] = fail[ 1 ] = 0;
3186     DEBUG_TRIE_COMPILE_r({
3187         PerlIO_printf(Perl_debug_log,
3188                       "%*sStclass Failtable (%"UVuf" states): 0",
3189                       (int)(depth * 2), "", (UV)numstates
3190         );
3191         for( q_read=1; q_read<numstates; q_read++ ) {
3192             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3193         }
3194         PerlIO_printf(Perl_debug_log, "\n");
3195     });
3196     Safefree(q);
3197     /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3198     return stclass;
3199 }
3200
3201
3202 #define DEBUG_PEEP(str,scan,depth) \
3203     DEBUG_OPTIMISE_r({if (scan){ \
3204        SV * const mysv=sv_newmortal(); \
3205        regnode *Next = regnext(scan); \
3206        regprop(RExC_rx, mysv, scan, NULL); \
3207        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
3208        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
3209        Next ? (REG_NODE_NUM(Next)) : 0 ); \
3210    }});
3211
3212
3213 /* The below joins as many adjacent EXACTish nodes as possible into a single
3214  * one.  The regop may be changed if the node(s) contain certain sequences that
3215  * require special handling.  The joining is only done if:
3216  * 1) there is room in the current conglomerated node to entirely contain the
3217  *    next one.
3218  * 2) they are the exact same node type
3219  *
3220  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3221  * these get optimized out
3222  *
3223  * If a node is to match under /i (folded), the number of characters it matches
3224  * can be different than its character length if it contains a multi-character
3225  * fold.  *min_subtract is set to the total delta number of characters of the
3226  * input nodes.
3227  *
3228  * And *unfolded_multi_char is set to indicate whether or not the node contains
3229  * an unfolded multi-char fold.  This happens when whether the fold is valid or
3230  * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3231  * SMALL LETTER SHARP S, as only if the target string being matched against
3232  * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3233  * folding rules depend on the locale in force at runtime.  (Multi-char folds
3234  * whose components are all above the Latin1 range are not run-time locale
3235  * dependent, and have already been folded by the time this function is
3236  * called.)
3237  *
3238  * This is as good a place as any to discuss the design of handling these
3239  * multi-character fold sequences.  It's been wrong in Perl for a very long
3240  * time.  There are three code points in Unicode whose multi-character folds
3241  * were long ago discovered to mess things up.  The previous designs for
3242  * dealing with these involved assigning a special node for them.  This
3243  * approach doesn't always work, as evidenced by this example:
3244  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
3245  * Both sides fold to "sss", but if the pattern is parsed to create a node that
3246  * would match just the \xDF, it won't be able to handle the case where a
3247  * successful match would have to cross the node's boundary.  The new approach
3248  * that hopefully generally solves the problem generates an EXACTFU_SS node
3249  * that is "sss" in this case.
3250  *
3251  * It turns out that there are problems with all multi-character folds, and not
3252  * just these three.  Now the code is general, for all such cases.  The
3253  * approach taken is:
3254  * 1)   This routine examines each EXACTFish node that could contain multi-
3255  *      character folded sequences.  Since a single character can fold into
3256  *      such a sequence, the minimum match length for this node is less than
3257  *      the number of characters in the node.  This routine returns in
3258  *      *min_subtract how many characters to subtract from the the actual
3259  *      length of the string to get a real minimum match length; it is 0 if
3260  *      there are no multi-char foldeds.  This delta is used by the caller to
3261  *      adjust the min length of the match, and the delta between min and max,
3262  *      so that the optimizer doesn't reject these possibilities based on size
3263  *      constraints.
3264  * 2)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3265  *      is used for an EXACTFU node that contains at least one "ss" sequence in
3266  *      it.  For non-UTF-8 patterns and strings, this is the only case where
3267  *      there is a possible fold length change.  That means that a regular
3268  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
3269  *      with length changes, and so can be processed faster.  regexec.c takes
3270  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
3271  *      pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3272  *      known until runtime).  This saves effort in regex matching.  However,
3273  *      the pre-folding isn't done for non-UTF8 patterns because the fold of
3274  *      the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3275  *      forcing the pattern into UTF8 unless necessary.  Also what EXACTF (and,
3276  *      again, EXACTFL) nodes fold to isn't known until runtime.  The fold
3277  *      possibilities for the non-UTF8 patterns are quite simple, except for
3278  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
3279  *      members of a fold-pair, and arrays are set up for all of them so that
3280  *      the other member of the pair can be found quickly.  Code elsewhere in
3281  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3282  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
3283  *      described in the next item.
3284  * 3)   A problem remains for unfolded multi-char folds. (These occur when the
3285  *      validity of the fold won't be known until runtime, and so must remain
3286  *      unfolded for now.  This happens for the sharp s in EXACTF and EXACTFA
3287  *      nodes when the pattern isn't in UTF-8.  (Note, BTW, that there cannot
3288  *      be an EXACTF node with a UTF-8 pattern.)  They also occur for various
3289  *      folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3290  *      The reason this is a problem is that the optimizer part of regexec.c
3291  *      (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3292  *      that a character in the pattern corresponds to at most a single
3293  *      character in the target string.  (And I do mean character, and not byte
3294  *      here, unlike other parts of the documentation that have never been
3295  *      updated to account for multibyte Unicode.)  sharp s in EXACTF and
3296  *      EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3297  *      it can match "\x{17F}\x{17F}".  These, along with other ones in EXACTFL
3298  *      nodes, violate the assumption, and they are the only instances where it
3299  *      is violated.  I'm reluctant to try to change the assumption, as the
3300  *      code involved is impenetrable to me (khw), so instead the code here
3301  *      punts.  This routine examines EXACTFL nodes, and (when the pattern
3302  *      isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3303  *      boolean indicating whether or not the node contains such a fold.  When
3304  *      it is true, the caller sets a flag that later causes the optimizer in
3305  *      this file to not set values for the floating and fixed string lengths,
3306  *      and thus avoids the optimizer code in regexec.c that makes the invalid
3307  *      assumption.  Thus, there is no optimization based on string lengths for
3308  *      EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3309  *      EXACTF and EXACTFA nodes that contain the sharp s.  (The reason the
3310  *      assumption is wrong only in these cases is that all other non-UTF-8
3311  *      folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3312  *      their expanded versions.  (Again, we can't prefold sharp s to 'ss' in
3313  *      EXACTF nodes because we don't know at compile time if it actually
3314  *      matches 'ss' or not.  For EXACTF nodes it will match iff the target
3315  *      string is in UTF-8.  This is in contrast to EXACTFU nodes, where it
3316  *      always matches; and EXACTFA where it never does.  In an EXACTFA node in
3317  *      a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3318  *      problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3319  *      string would require the pattern to be forced into UTF-8, the overhead
3320  *      of which we want to avoid.  Similarly the unfolded multi-char folds in
3321  *      EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3322  *      locale.)
3323  *
3324  *      Similarly, the code that generates tries doesn't currently handle
3325  *      not-already-folded multi-char folds, and it looks like a pain to change
3326  *      that.  Therefore, trie generation of EXACTFA nodes with the sharp s
3327  *      doesn't work.  Instead, such an EXACTFA is turned into a new regnode,
3328  *      EXACTFA_NO_TRIE, which the trie code knows not to handle.  Most people
3329  *      using /iaa matching will be doing so almost entirely with ASCII
3330  *      strings, so this should rarely be encountered in practice */
3331
3332 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3333     if (PL_regkind[OP(scan)] == EXACT) \
3334         join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3335
3336 STATIC U32
3337 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3338                    UV *min_subtract, bool *unfolded_multi_char,
3339                    U32 flags,regnode *val, U32 depth)
3340 {
3341     /* Merge several consecutive EXACTish nodes into one. */
3342     regnode *n = regnext(scan);
3343     U32 stringok = 1;
3344     regnode *next = scan + NODE_SZ_STR(scan);
3345     U32 merged = 0;
3346     U32 stopnow = 0;
3347 #ifdef DEBUGGING
3348     regnode *stop = scan;
3349     GET_RE_DEBUG_FLAGS_DECL;
3350 #else
3351     PERL_UNUSED_ARG(depth);
3352 #endif
3353
3354     PERL_ARGS_ASSERT_JOIN_EXACT;
3355 #ifndef EXPERIMENTAL_INPLACESCAN
3356     PERL_UNUSED_ARG(flags);
3357     PERL_UNUSED_ARG(val);
3358 #endif
3359     DEBUG_PEEP("join",scan,depth);
3360
3361     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
3362      * EXACT ones that are mergeable to the current one. */
3363     while (n
3364            && (PL_regkind[OP(n)] == NOTHING
3365                || (stringok && OP(n) == OP(scan)))
3366            && NEXT_OFF(n)
3367            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3368     {
3369
3370         if (OP(n) == TAIL || n > next)
3371             stringok = 0;
3372         if (PL_regkind[OP(n)] == NOTHING) {
3373             DEBUG_PEEP("skip:",n,depth);
3374             NEXT_OFF(scan) += NEXT_OFF(n);
3375             next = n + NODE_STEP_REGNODE;
3376 #ifdef DEBUGGING
3377             if (stringok)
3378                 stop = n;
3379 #endif
3380             n = regnext(n);
3381         }
3382         else if (stringok) {
3383             const unsigned int oldl = STR_LEN(scan);
3384             regnode * const nnext = regnext(n);
3385
3386             /* XXX I (khw) kind of doubt that this works on platforms (should
3387              * Perl ever run on one) where U8_MAX is above 255 because of lots
3388              * of other assumptions */
3389             /* Don't join if the sum can't fit into a single node */
3390             if (oldl + STR_LEN(n) > U8_MAX)
3391                 break;
3392
3393             DEBUG_PEEP("merg",n,depth);
3394             merged++;
3395
3396             NEXT_OFF(scan) += NEXT_OFF(n);
3397             STR_LEN(scan) += STR_LEN(n);
3398             next = n + NODE_SZ_STR(n);
3399             /* Now we can overwrite *n : */
3400             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3401 #ifdef DEBUGGING
3402             stop = next - 1;
3403 #endif
3404             n = nnext;
3405             if (stopnow) break;
3406         }
3407
3408 #ifdef EXPERIMENTAL_INPLACESCAN
3409         if (flags && !NEXT_OFF(n)) {
3410             DEBUG_PEEP("atch", val, depth);
3411             if (reg_off_by_arg[OP(n)]) {
3412                 ARG_SET(n, val - n);
3413             }
3414             else {
3415                 NEXT_OFF(n) = val - n;
3416             }
3417             stopnow = 1;
3418         }
3419 #endif
3420     }
3421
3422     *min_subtract = 0;
3423     *unfolded_multi_char = FALSE;
3424
3425     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
3426      * can now analyze for sequences of problematic code points.  (Prior to
3427      * this final joining, sequences could have been split over boundaries, and
3428      * hence missed).  The sequences only happen in folding, hence for any
3429      * non-EXACT EXACTish node */
3430     if (OP(scan) != EXACT) {
3431         U8* s0 = (U8*) STRING(scan);
3432         U8* s = s0;
3433         U8* s_end = s0 + STR_LEN(scan);
3434
3435         int total_count_delta = 0;  /* Total delta number of characters that
3436                                        multi-char folds expand to */
3437
3438         /* One pass is made over the node's string looking for all the
3439          * possibilities.  To avoid some tests in the loop, there are two main
3440          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3441          * non-UTF-8 */
3442         if (UTF) {
3443             U8* folded = NULL;
3444
3445             if (OP(scan) == EXACTFL) {
3446                 U8 *d;
3447
3448                 /* An EXACTFL node would already have been changed to another
3449                  * node type unless there is at least one character in it that
3450                  * is problematic; likely a character whose fold definition
3451                  * won't be known until runtime, and so has yet to be folded.
3452                  * For all but the UTF-8 locale, folds are 1-1 in length, but
3453                  * to handle the UTF-8 case, we need to create a temporary
3454                  * folded copy using UTF-8 locale rules in order to analyze it.
3455                  * This is because our macros that look to see if a sequence is
3456                  * a multi-char fold assume everything is folded (otherwise the
3457                  * tests in those macros would be too complicated and slow).
3458                  * Note that here, the non-problematic folds will have already
3459                  * been done, so we can just copy such characters.  We actually
3460                  * don't completely fold the EXACTFL string.  We skip the
3461                  * unfolded multi-char folds, as that would just create work
3462                  * below to figure out the size they already are */
3463
3464                 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3465                 d = folded;
3466                 while (s < s_end) {
3467                     STRLEN s_len = UTF8SKIP(s);
3468                     if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3469                         Copy(s, d, s_len, U8);
3470                         d += s_len;
3471                     }
3472                     else if (is_FOLDS_TO_MULTI_utf8(s)) {
3473                         *unfolded_multi_char = TRUE;
3474                         Copy(s, d, s_len, U8);
3475                         d += s_len;
3476                     }
3477                     else if (isASCII(*s)) {
3478                         *(d++) = toFOLD(*s);
3479                     }
3480                     else {
3481                         STRLEN len;
3482                         _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3483                         d += len;
3484                     }
3485                     s += s_len;
3486                 }
3487
3488                 /* Point the remainder of the routine to look at our temporary
3489                  * folded copy */
3490                 s = folded;
3491                 s_end = d;
3492             } /* End of creating folded copy of EXACTFL string */
3493
3494             /* Examine the string for a multi-character fold sequence.  UTF-8
3495              * patterns have all characters pre-folded by the time this code is
3496              * executed */
3497             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3498                                      length sequence we are looking for is 2 */
3499             {
3500                 int count = 0;  /* How many characters in a multi-char fold */
3501                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3502                 if (! len) {    /* Not a multi-char fold: get next char */
3503                     s += UTF8SKIP(s);
3504                     continue;
3505                 }
3506
3507                 /* Nodes with 'ss' require special handling, except for
3508                  * EXACTFA-ish for which there is no multi-char fold to this */
3509                 if (len == 2 && *s == 's' && *(s+1) == 's'
3510                     && OP(scan) != EXACTFA
3511                     && OP(scan) != EXACTFA_NO_TRIE)
3512                 {
3513                     count = 2;
3514                     if (OP(scan) != EXACTFL) {
3515                         OP(scan) = EXACTFU_SS;
3516                     }
3517                     s += 2;
3518                 }
3519                 else { /* Here is a generic multi-char fold. */
3520                     U8* multi_end  = s + len;
3521
3522                     /* Count how many characters are in it.  In the case of
3523                      * /aa, no folds which contain ASCII code points are
3524                      * allowed, so check for those, and skip if found. */
3525                     if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3526                         count = utf8_length(s, multi_end);
3527                         s = multi_end;
3528                     }
3529                     else {
3530                         while (s < multi_end) {
3531                             if (isASCII(*s)) {
3532                                 s++;
3533                                 goto next_iteration;
3534                             }
3535                             else {
3536                                 s += UTF8SKIP(s);
3537                             }
3538                             count++;
3539                         }
3540                     }
3541                 }
3542
3543                 /* The delta is how long the sequence is minus 1 (1 is how long
3544                  * the character that folds to the sequence is) */
3545                 total_count_delta += count - 1;
3546               next_iteration: ;
3547             }
3548
3549             /* We created a temporary folded copy of the string in EXACTFL
3550              * nodes.  Therefore we need to be sure it doesn't go below zero,
3551              * as the real string could be shorter */
3552             if (OP(scan) == EXACTFL) {
3553                 int total_chars = utf8_length((U8*) STRING(scan),
3554                                            (U8*) STRING(scan) + STR_LEN(scan));
3555                 if (total_count_delta > total_chars) {
3556                     total_count_delta = total_chars;
3557                 }
3558             }
3559
3560             *min_subtract += total_count_delta;
3561             Safefree(folded);
3562         }
3563         else if (OP(scan) == EXACTFA) {
3564
3565             /* Non-UTF-8 pattern, EXACTFA node.  There can't be a multi-char
3566              * fold to the ASCII range (and there are no existing ones in the
3567              * upper latin1 range).  But, as outlined in the comments preceding
3568              * this function, we need to flag any occurrences of the sharp s.
3569              * This character forbids trie formation (because of added
3570              * complexity) */
3571             while (s < s_end) {
3572                 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3573                     OP(scan) = EXACTFA_NO_TRIE;
3574                     *unfolded_multi_char = TRUE;
3575                     break;
3576                 }
3577                 s++;
3578                 continue;
3579             }
3580         }
3581         else {
3582
3583             /* Non-UTF-8 pattern, not EXACTFA node.  Look for the multi-char
3584              * folds that are all Latin1.  As explained in the comments
3585              * preceding this function, we look also for the sharp s in EXACTF
3586              * and EXACTFL nodes; it can be in the final position.  Otherwise
3587              * we can stop looking 1 byte earlier because have to find at least
3588              * two characters for a multi-fold */
3589             const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3590                               ? s_end
3591                               : s_end -1;
3592
3593             while (s < upper) {
3594                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3595                 if (! len) {    /* Not a multi-char fold. */
3596                     if (*s == LATIN_SMALL_LETTER_SHARP_S
3597                         && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
3598                     {
3599                         *unfolded_multi_char = TRUE;
3600                     }
3601                     s++;
3602                     continue;
3603                 }
3604
3605                 if (len == 2
3606                     && isALPHA_FOLD_EQ(*s, 's')
3607                     && isALPHA_FOLD_EQ(*(s+1), 's'))
3608                 {
3609
3610                     /* EXACTF nodes need to know that the minimum length
3611                      * changed so that a sharp s in the string can match this
3612                      * ss in the pattern, but they remain EXACTF nodes, as they
3613                      * won't match this unless the target string is is UTF-8,
3614                      * which we don't know until runtime.  EXACTFL nodes can't
3615                      * transform into EXACTFU nodes */
3616                     if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
3617                         OP(scan) = EXACTFU_SS;
3618                     }
3619                 }
3620
3621                 *min_subtract += len - 1;
3622                 s += len;
3623             }
3624         }
3625     }
3626
3627 #ifdef DEBUGGING
3628     /* Allow dumping but overwriting the collection of skipped
3629      * ops and/or strings with fake optimized ops */
3630     n = scan + NODE_SZ_STR(scan);
3631     while (n <= stop) {
3632         OP(n) = OPTIMIZED;
3633         FLAGS(n) = 0;
3634         NEXT_OFF(n) = 0;
3635         n++;
3636     }
3637 #endif
3638     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3639     return stopnow;
3640 }
3641
3642 /* REx optimizer.  Converts nodes into quicker variants "in place".
3643    Finds fixed substrings.  */
3644
3645 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3646    to the position after last scanned or to NULL. */
3647
3648 #define INIT_AND_WITHP \
3649     assert(!and_withp); \
3650     Newx(and_withp,1, regnode_ssc); \
3651     SAVEFREEPV(and_withp)
3652
3653 /* this is a chain of data about sub patterns we are processing that
3654    need to be handled separately/specially in study_chunk. Its so
3655    we can simulate recursion without losing state.  */
3656 struct scan_frame;
3657 typedef struct scan_frame {
3658     regnode *last;  /* last node to process in this frame */
3659     regnode *next;  /* next node to process when last is reached */
3660     struct scan_frame *prev; /*previous frame*/
3661     U32 prev_recursed_depth;
3662     I32 stop; /* what stopparen do we use */
3663 } scan_frame;
3664
3665
3666 STATIC SSize_t
3667 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3668                         SSize_t *minlenp, SSize_t *deltap,
3669                         regnode *last,
3670                         scan_data_t *data,
3671                         I32 stopparen,
3672                         U32 recursed_depth,
3673                         regnode_ssc *and_withp,
3674                         U32 flags, U32 depth)
3675                         /* scanp: Start here (read-write). */
3676                         /* deltap: Write maxlen-minlen here. */
3677                         /* last: Stop before this one. */
3678                         /* data: string data about the pattern */
3679                         /* stopparen: treat close N as END */
3680                         /* recursed: which subroutines have we recursed into */
3681                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3682 {
3683     /* There must be at least this number of characters to match */
3684     SSize_t min = 0;
3685     I32 pars = 0, code;
3686     regnode *scan = *scanp, *next;
3687     SSize_t delta = 0;
3688     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3689     int is_inf_internal = 0;            /* The studied chunk is infinite */
3690     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3691     scan_data_t data_fake;
3692     SV *re_trie_maxbuff = NULL;
3693     regnode *first_non_open = scan;
3694     SSize_t stopmin = SSize_t_MAX;
3695     scan_frame *frame = NULL;
3696     GET_RE_DEBUG_FLAGS_DECL;
3697
3698     PERL_ARGS_ASSERT_STUDY_CHUNK;
3699
3700 #ifdef DEBUGGING
3701     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3702 #endif
3703     if ( depth == 0 ) {
3704         while (first_non_open && OP(first_non_open) == OPEN)
3705             first_non_open=regnext(first_non_open);
3706     }
3707
3708
3709   fake_study_recurse:
3710     DEBUG_r(
3711         RExC_study_chunk_recursed_count++;
3712     );
3713     while ( scan && OP(scan) != END && scan < last ){
3714         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3715                                    node length to get a real minimum (because
3716                                    the folded version may be shorter) */
3717         bool unfolded_multi_char = FALSE;
3718         /* Peephole optimizer: */
3719         DEBUG_OPTIMISE_MORE_r(
3720         {
3721             PerlIO_printf(Perl_debug_log,
3722                 "%*sstudy_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu ",
3723                 ((int) depth*2), "", (long)stopparen,
3724                 (unsigned long)RExC_study_chunk_recursed_count,
3725                 (unsigned long)depth, (unsigned long)recursed_depth);
3726             if (recursed_depth) {
3727                 U32 i;
3728                 U32 j;
3729                 for ( j = 0 ; j < recursed_depth ; j++ ) {
3730                     PerlIO_printf(Perl_debug_log,"[");
3731                     for ( i = 0 ; i < (U32)RExC_npar ; i++ )
3732                         PerlIO_printf(Perl_debug_log,"%d",
3733                             PAREN_TEST(RExC_study_chunk_recursed +
3734                                        (j * RExC_study_chunk_recursed_bytes), i)
3735                             ? 1 : 0
3736                         );
3737                     PerlIO_printf(Perl_debug_log,"]");
3738                 }
3739             }
3740             PerlIO_printf(Perl_debug_log,"\n");
3741         }
3742         );
3743         DEBUG_STUDYDATA("Peep:", data, depth);
3744         DEBUG_PEEP("Peep", scan, depth);
3745
3746
3747         /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/
3748          * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled
3749          * by a different invocation of reg() -- Yves
3750          */
3751         JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
3752
3753         /* Follow the next-chain of the current node and optimize
3754            away all the NOTHINGs from it.  */
3755         if (OP(scan) != CURLYX) {
3756             const int max = (reg_off_by_arg[OP(scan)]
3757                        ? I32_MAX
3758                        /* I32 may be smaller than U16 on CRAYs! */
3759                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3760             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3761             int noff;
3762             regnode *n = scan;
3763
3764             /* Skip NOTHING and LONGJMP. */
3765             while ((n = regnext(n))
3766                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3767                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3768                    && off + noff < max)
3769                 off += noff;
3770             if (reg_off_by_arg[OP(scan)])
3771                 ARG(scan) = off;
3772             else
3773                 NEXT_OFF(scan) = off;
3774         }
3775
3776
3777
3778         /* The principal pseudo-switch.  Cannot be a switch, since we
3779            look into several different things.  */
3780         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3781                    || OP(scan) == IFTHEN) {
3782             next = regnext(scan);
3783             code = OP(scan);
3784             /* demq: the op(next)==code check is to see if we have
3785              * "branch-branch" AFAICT */
3786
3787             if (OP(next) == code || code == IFTHEN) {
3788                 /* NOTE - There is similar code to this block below for
3789                  * handling TRIE nodes on a re-study.  If you change stuff here
3790                  * check there too. */
3791                 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3792                 regnode_ssc accum;
3793                 regnode * const startbranch=scan;
3794
3795                 if (flags & SCF_DO_SUBSTR) {
3796                     /* Cannot merge strings after this. */
3797                     scan_commit(pRExC_state, data, minlenp, is_inf);
3798                 }
3799
3800                 if (flags & SCF_DO_STCLASS)
3801                     ssc_init_zero(pRExC_state, &accum);
3802
3803                 while (OP(scan) == code) {
3804                     SSize_t deltanext, minnext, fake;
3805                     I32 f = 0;
3806                     regnode_ssc this_class;
3807
3808                     num++;
3809                     data_fake.flags = 0;
3810                     if (data) {
3811                         data_fake.whilem_c = data->whilem_c;
3812                         data_fake.last_closep = data->last_closep;
3813                     }
3814                     else
3815                         data_fake.last_closep = &fake;
3816
3817                     data_fake.pos_delta = delta;
3818                     next = regnext(scan);
3819                     scan = NEXTOPER(scan);
3820                     if (code != BRANCH)
3821                         scan = NEXTOPER(scan);
3822                     if (flags & SCF_DO_STCLASS) {
3823                         ssc_init(pRExC_state, &this_class);
3824                         data_fake.start_class = &this_class;
3825                         f = SCF_DO_STCLASS_AND;
3826                     }
3827                     if (flags & SCF_WHILEM_VISITED_POS)
3828                         f |= SCF_WHILEM_VISITED_POS;
3829
3830                     /* we suppose the run is continuous, last=next...*/
3831                     minnext = study_chunk(pRExC_state, &scan, minlenp,
3832                                       &deltanext, next, &data_fake, stopparen,
3833                                       recursed_depth, NULL, f,depth+1);
3834                     if (min1 > minnext)
3835                         min1 = minnext;
3836                     if (deltanext == SSize_t_MAX) {
3837                         is_inf = is_inf_internal = 1;
3838                         max1 = SSize_t_MAX;
3839                     } else if (max1 < minnext + deltanext)
3840                         max1 = minnext + deltanext;
3841                     scan = next;
3842                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3843                         pars++;
3844                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3845                         if ( stopmin > minnext)
3846                             stopmin = min + min1;
3847                         flags &= ~SCF_DO_SUBSTR;
3848                         if (data)
3849                             data->flags |= SCF_SEEN_ACCEPT;
3850                     }
3851                     if (data) {
3852                         if (data_fake.flags & SF_HAS_EVAL)
3853                             data->flags |= SF_HAS_EVAL;
3854                         data->whilem_c = data_fake.whilem_c;
3855                     }
3856                     if (flags & SCF_DO_STCLASS)
3857                         ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
3858                 }
3859                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3860                     min1 = 0;
3861                 if (flags & SCF_DO_SUBSTR) {
3862                     data->pos_min += min1;
3863                     if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3864                         data->pos_delta = SSize_t_MAX;
3865                     else
3866                         data->pos_delta += max1 - min1;
3867                     if (max1 != min1 || is_inf)
3868                         data->longest = &(data->longest_float);
3869                 }
3870                 min += min1;
3871                 if (delta == SSize_t_MAX
3872                  || SSize_t_MAX - delta - (max1 - min1) < 0)
3873                     delta = SSize_t_MAX;
3874                 else
3875                     delta += max1 - min1;
3876                 if (flags & SCF_DO_STCLASS_OR) {
3877                     ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
3878                     if (min1) {
3879                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3880                         flags &= ~SCF_DO_STCLASS;
3881                     }
3882                 }
3883                 else if (flags & SCF_DO_STCLASS_AND) {
3884                     if (min1) {
3885                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3886                         flags &= ~SCF_DO_STCLASS;
3887                     }
3888                     else {
3889                         /* Switch to OR mode: cache the old value of
3890                          * data->start_class */
3891                         INIT_AND_WITHP;
3892                         StructCopy(data->start_class, and_withp, regnode_ssc);
3893                         flags &= ~SCF_DO_STCLASS_AND;
3894                         StructCopy(&accum, data->start_class, regnode_ssc);
3895                         flags |= SCF_DO_STCLASS_OR;
3896                     }
3897                 }
3898
3899                 if (PERL_ENABLE_TRIE_OPTIMISATION &&
3900                         OP( startbranch ) == BRANCH )
3901                 {
3902                 /* demq.
3903
3904                    Assuming this was/is a branch we are dealing with: 'scan'
3905                    now points at the item that follows the branch sequence,
3906                    whatever it is. We now start at the beginning of the
3907                    sequence and look for subsequences of
3908
3909                    BRANCH->EXACT=>x1
3910                    BRANCH->EXACT=>x2
3911                    tail
3912
3913                    which would be constructed from a pattern like
3914                    /A|LIST|OF|WORDS/
3915
3916                    If we can find such a subsequence we need to turn the first
3917                    element into a trie and then add the subsequent branch exact
3918                    strings to the trie.
3919
3920                    We have two cases
3921
3922                      1. patterns where the whole set of branches can be
3923                         converted.
3924
3925                      2. patterns where only a subset can be converted.
3926
3927                    In case 1 we can replace the whole set with a single regop
3928                    for the trie. In case 2 we need to keep the start and end
3929                    branches so
3930
3931                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3932                      becomes BRANCH TRIE; BRANCH X;
3933
3934                   There is an additional case, that being where there is a
3935                   common prefix, which gets split out into an EXACT like node
3936                   preceding the TRIE node.
3937
3938                   If x(1..n)==tail then we can do a simple trie, if not we make
3939                   a "jump" trie, such that when we match the appropriate word
3940                   we "jump" to the appropriate tail node. Essentially we turn
3941                   a nested if into a case structure of sorts.
3942
3943                 */
3944
3945                     int made=0;
3946                     if (!re_trie_maxbuff) {
3947                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3948                         if (!SvIOK(re_trie_maxbuff))
3949                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3950                     }
3951                     if ( SvIV(re_trie_maxbuff)>=0  ) {
3952                         regnode *cur;
3953                         regnode *first = (regnode *)NULL;
3954                         regnode *last = (regnode *)NULL;
3955                         regnode *tail = scan;
3956                         U8 trietype = 0;
3957                         U32 count=0;
3958
3959 #ifdef DEBUGGING
3960                         SV * const mysv = sv_newmortal();   /* for dumping */
3961 #endif
3962                         /* var tail is used because there may be a TAIL
3963                            regop in the way. Ie, the exacts will point to the
3964                            thing following the TAIL, but the last branch will
3965                            point at the TAIL. So we advance tail. If we
3966                            have nested (?:) we may have to move through several
3967                            tails.
3968                          */
3969
3970                         while ( OP( tail ) == TAIL ) {
3971                             /* this is the TAIL generated by (?:) */
3972                             tail = regnext( tail );
3973                         }
3974
3975
3976                         DEBUG_TRIE_COMPILE_r({
3977                             regprop(RExC_rx, mysv, tail, NULL);
3978                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3979                               (int)depth * 2 + 2, "",
3980                               "Looking for TRIE'able sequences. Tail node is: ",
3981                               SvPV_nolen_const( mysv )
3982                             );
3983                         });
3984
3985                         /*
3986
3987                             Step through the branches
3988                                 cur represents each branch,
3989                                 noper is the first thing to be matched as part
3990                                       of that branch
3991                                 noper_next is the regnext() of that node.
3992
3993                             We normally handle a case like this
3994                             /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
3995                             support building with NOJUMPTRIE, which restricts
3996                             the trie logic to structures like /FOO|BAR/.
3997
3998                             If noper is a trieable nodetype then the branch is
3999                             a possible optimization target. If we are building
4000                             under NOJUMPTRIE then we require that noper_next is
4001                             the same as scan (our current position in the regex
4002                             program).
4003
4004                             Once we have two or more consecutive such branches
4005                             we can create a trie of the EXACT's contents and
4006                             stitch it in place into the program.
4007
4008                             If the sequence represents all of the branches in
4009                             the alternation we replace the entire thing with a
4010                             single TRIE node.
4011
4012                             Otherwise when it is a subsequence we need to
4013                             stitch it in place and replace only the relevant
4014                             branches. This means the first branch has to remain
4015                             as it is used by the alternation logic, and its
4016                             next pointer, and needs to be repointed at the item
4017                             on the branch chain following the last branch we
4018                             have optimized away.
4019
4020                             This could be either a BRANCH, in which case the
4021                             subsequence is internal, or it could be the item
4022                             following the branch sequence in which case the
4023                             subsequence is at the end (which does not
4024                             necessarily mean the first node is the start of the
4025                             alternation).
4026
4027                             TRIE_TYPE(X) is a define which maps the optype to a
4028                             trietype.
4029
4030                                 optype          |  trietype
4031                                 ----------------+-----------
4032                                 NOTHING         | NOTHING
4033                                 EXACT           | EXACT
4034                                 EXACTFU         | EXACTFU
4035                                 EXACTFU_SS      | EXACTFU
4036                                 EXACTFA         | EXACTFA
4037
4038
4039                         */
4040 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
4041                        ( EXACT == (X) )   ? EXACT :        \
4042                        ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU :        \
4043                        ( EXACTFA == (X) ) ? EXACTFA :        \
4044                        0 )
4045
4046                         /* dont use tail as the end marker for this traverse */
4047                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4048                             regnode * const noper = NEXTOPER( cur );
4049                             U8 noper_type = OP( noper );
4050                             U8 noper_trietype = TRIE_TYPE( noper_type );
4051 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4052                             regnode * const noper_next = regnext( noper );
4053                             U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
4054                             U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
4055 #endif
4056
4057                             DEBUG_TRIE_COMPILE_r({
4058                                 regprop(RExC_rx, mysv, cur, NULL);
4059                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
4060                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
4061
4062                                 regprop(RExC_rx, mysv, noper, NULL);
4063                                 PerlIO_printf( Perl_debug_log, " -> %s",
4064                                     SvPV_nolen_const(mysv));
4065
4066                                 if ( noper_next ) {
4067                                   regprop(RExC_rx, mysv, noper_next, NULL);
4068                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
4069                                     SvPV_nolen_const(mysv));
4070                                 }
4071                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
4072                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4073                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
4074                                 );
4075                             });
4076
4077                             /* Is noper a trieable nodetype that can be merged
4078                              * with the current trie (if there is one)? */
4079                             if ( noper_trietype
4080                                   &&
4081                                   (
4082                                         ( noper_trietype == NOTHING)
4083                                         || ( trietype == NOTHING )
4084                                         || ( trietype == noper_trietype )
4085                                   )
4086 #ifdef NOJUMPTRIE
4087                                   && noper_next == tail
4088 #endif
4089                                   && count < U16_MAX)
4090                             {
4091                                 /* Handle mergable triable node Either we are
4092                                  * the first node in a new trieable sequence,
4093                                  * in which case we do some bookkeeping,
4094                                  * otherwise we update the end pointer. */
4095                                 if ( !first ) {
4096                                     first = cur;
4097                                     if ( noper_trietype == NOTHING ) {
4098 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4099                                         regnode * const noper_next = regnext( noper );
4100                                         U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
4101                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4102 #endif
4103
4104                                         if ( noper_next_trietype ) {
4105                                             trietype = noper_next_trietype;
4106                                         } else if (noper_next_type)  {
4107                                             /* a NOTHING regop is 1 regop wide.
4108                                              * We need at least two for a trie
4109                                              * so we can't merge this in */
4110                                             first = NULL;
4111                                         }
4112                                     } else {
4113                                         trietype = noper_trietype;
4114                                     }
4115                                 } else {
4116                                     if ( trietype == NOTHING )
4117                                         trietype = noper_trietype;
4118                                     last = cur;
4119                                 }
4120                                 if (first)
4121                                     count++;
4122                             } /* end handle mergable triable node */
4123                             else {
4124                                 /* handle unmergable node -
4125                                  * noper may either be a triable node which can
4126                                  * not be tried together with the current trie,
4127                                  * or a non triable node */
4128                                 if ( last ) {
4129                                     /* If last is set and trietype is not
4130                                      * NOTHING then we have found at least two
4131                                      * triable branch sequences in a row of a
4132                                      * similar trietype so we can turn them
4133                                      * into a trie. If/when we allow NOTHING to
4134                                      * start a trie sequence this condition
4135                                      * will be required, and it isn't expensive
4136                                      * so we leave it in for now. */
4137                                     if ( trietype && trietype != NOTHING )
4138                                         make_trie( pRExC_state,
4139                                                 startbranch, first, cur, tail,
4140                                                 count, trietype, depth+1 );
4141                                     last = NULL; /* note: we clear/update
4142                                                     first, trietype etc below,
4143                                                     so we dont do it here */
4144                                 }
4145                                 if ( noper_trietype
4146 #ifdef NOJUMPTRIE
4147                                      && noper_next == tail
4148 #endif
4149                                 ){
4150                                     /* noper is triable, so we can start a new
4151                                      * trie sequence */
4152                                     count = 1;
4153                                     first = cur;
4154                                     trietype = noper_trietype;
4155                                 } else if (first) {
4156                                     /* if we already saw a first but the
4157                                      * current node is not triable then we have
4158                                      * to reset the first information. */
4159                                     count = 0;
4160                                     first = NULL;
4161                                     trietype = 0;
4162                                 }
4163                             } /* end handle unmergable node */
4164                         } /* loop over branches */
4165                         DEBUG_TRIE_COMPILE_r({
4166                             regprop(RExC_rx, mysv, cur, NULL);
4167                             PerlIO_printf( Perl_debug_log,
4168                               "%*s- %s (%d) <SCAN FINISHED>\n",
4169                               (int)depth * 2 + 2,
4170                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4171
4172                         });
4173                         if ( last && trietype ) {
4174                             if ( trietype != NOTHING ) {
4175                                 /* the last branch of the sequence was part of
4176                                  * a trie, so we have to construct it here
4177                                  * outside of the loop */
4178                                 made= make_trie( pRExC_state, startbranch,
4179                                                  first, scan, tail, count,
4180                                                  trietype, depth+1 );
4181 #ifdef TRIE_STUDY_OPT
4182                                 if ( ((made == MADE_EXACT_TRIE &&
4183                                      startbranch == first)
4184                                      || ( first_non_open == first )) &&
4185                                      depth==0 ) {
4186                                     flags |= SCF_TRIE_RESTUDY;
4187                                     if ( startbranch == first
4188                                          && scan == tail )
4189                                     {
4190                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4191                                     }
4192                                 }
4193 #endif
4194                             } else {
4195                                 /* at this point we know whatever we have is a
4196                                  * NOTHING sequence/branch AND if 'startbranch'
4197                                  * is 'first' then we can turn the whole thing
4198                                  * into a NOTHING
4199                                  */
4200                                 if ( startbranch == first ) {
4201                                     regnode *opt;
4202                                     /* the entire thing is a NOTHING sequence,
4203                                      * something like this: (?:|) So we can
4204                                      * turn it into a plain NOTHING op. */
4205                                     DEBUG_TRIE_COMPILE_r({
4206                                         regprop(RExC_rx, mysv, cur, NULL);
4207                                         PerlIO_printf( Perl_debug_log,
4208                                           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
4209                                           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4210
4211                                     });
4212                                     OP(startbranch)= NOTHING;
4213                                     NEXT_OFF(startbranch)= tail - startbranch;
4214                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
4215                                         OP(opt)= OPTIMIZED;
4216                                 }
4217                             }
4218                         } /* end if ( last) */
4219                     } /* TRIE_MAXBUF is non zero */
4220
4221                 } /* do trie */
4222
4223             }
4224             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
4225                 scan = NEXTOPER(NEXTOPER(scan));
4226             } else                      /* single branch is optimized. */
4227                 scan = NEXTOPER(scan);
4228             continue;
4229         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
4230             scan_frame *newframe = NULL;
4231             I32 paren;
4232             regnode *start;
4233             regnode *end;
4234             U32 my_recursed_depth= recursed_depth;
4235
4236             if (OP(scan) != SUSPEND) { /* GOSUB/GOSTART */
4237                 /* set the pointer */
4238                 if (OP(scan) == GOSUB) {
4239                     paren = ARG(scan);
4240                     RExC_recurse[ARG2L(scan)] = scan;
4241                     start = RExC_open_parens[paren-1];
4242                     end   = RExC_close_parens[paren-1];
4243                 } else {
4244                     paren = 0;
4245                     start = RExC_rxi->program + 1;
4246                     end   = RExC_opend;
4247                 }
4248                 /* this code is intended to handle expanding regex "subs" so
4249                  * we can apply various optimizations. For instance with
4250                  * /(?(DEFINE)(?<foo>foo)(?<bar>bar))(?&foo)(?&bar)/ we
4251                  * want to recognize that the mandatory substr is going to be
4252                  * "foobar".
4253                  * However if we are not in SCF_DO_SUBSTR mode then there is
4254                  * no point in doing this, and it can cause a serious slowdown.
4255                  * See RT #122283.
4256                  * Note also that this was a workaround for the core problem
4257                  * which was that during compilation logic the excessive
4258                  * recursion resulted in slowly consuming all the memory on
4259                  * the box. Exactly what causes this is unclear. It does not
4260                  * appear to be directly related to allocating the "visited"
4261                  * bitmaps that is RExC_study_chunk_recursed.
4262                  *
4263                  * In reality study_chunk() does far far too much, and probably
4264                  * this an other issues would go away if we split it into
4265                  * multiple components.
4266                  *
4267                  * - Yves
4268                  * */
4269                 if (flags & SCF_DO_SUBSTR) {
4270                 if (
4271                     !recursed_depth
4272                     ||
4273                     !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4274                 ) {
4275                     /* it is quite possible that there are more efficient ways
4276                      * to do this. We maintain a bitmap per level of recursion
4277                      * of which patterns we have entered so we can detect if a
4278                      * pattern creates a possible infinite loop. When we
4279                      * recurse down a level we copy the previous levels bitmap
4280                      * down. When we are at recursion level 0 we zero the top
4281                      * level bitmap. It would be nice to implement a different
4282                      * more efficient way of doing this. In particular the top
4283                      * level bitmap may be unnecessary.
4284                      */
4285                     if (!recursed_depth) {
4286                         Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4287                     } else {
4288                         Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4289                              RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4290                              RExC_study_chunk_recursed_bytes, U8);
4291                     }
4292                     /* we havent recursed into this paren yet, so recurse into it */
4293                     DEBUG_STUDYDATA("set:", data,depth);
4294                     PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4295                     my_recursed_depth= recursed_depth + 1;
4296                     Newx(newframe,1,scan_frame);
4297                 } else {
4298                     DEBUG_STUDYDATA("inf:", data,depth);
4299                     /* some form of infinite recursion, assume infinite length
4300                      * */
4301                     if (flags & SCF_DO_SUBSTR) {
4302                         scan_commit(pRExC_state, data, minlenp, is_inf);
4303                         data->longest = &(data->longest_float);
4304                     }
4305                     is_inf = is_inf_internal = 1;
4306                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4307                         ssc_anything(data->start_class);
4308                     flags &= ~SCF_DO_STCLASS;
4309                 }
4310                 }
4311             } else {
4312                 Newx(newframe,1,scan_frame);
4313                 paren = stopparen;
4314                 start = scan+2;
4315                 end = regnext(scan);
4316             }
4317             if (newframe) {
4318                 assert(start);
4319                 assert(end);
4320                 SAVEFREEPV(newframe);
4321                 newframe->next = regnext(scan);
4322                 newframe->last = last;
4323                 newframe->stop = stopparen;
4324                 newframe->prev = frame;
4325                 newframe->prev_recursed_depth = recursed_depth;
4326
4327                 DEBUG_STUDYDATA("frame-new:",data,depth);
4328                 DEBUG_PEEP("fnew", scan, depth);
4329
4330                 frame = newframe;
4331                 scan =  start;
4332                 stopparen = paren;
4333                 last = end;
4334                 depth = depth + 1;
4335                 recursed_depth= my_recursed_depth;
4336
4337                 continue;
4338             }
4339         }
4340         else if (OP(scan) == EXACT) {
4341             SSize_t l = STR_LEN(scan);
4342             UV uc;
4343             if (UTF) {
4344                 const U8 * const s = (U8*)STRING(scan);
4345                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4346                 l = utf8_length(s, s + l);
4347             } else {
4348                 uc = *((U8*)STRING(scan));
4349             }
4350             min += l;
4351             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4352                 /* The code below prefers earlier match for fixed
4353                    offset, later match for variable offset.  */
4354                 if (data->last_end == -1) { /* Update the start info. */
4355                     data->last_start_min = data->pos_min;
4356                     data->last_start_max = is_inf
4357                         ? SSize_t_MAX : data->pos_min + data->pos_delta;
4358                 }
4359                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4360                 if (UTF)
4361                     SvUTF8_on(data->last_found);
4362                 {
4363                     SV * const sv = data->last_found;
4364                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4365                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4366                     if (mg && mg->mg_len >= 0)
4367                         mg->mg_len += utf8_length((U8*)STRING(scan),
4368                                               (U8*)STRING(scan)+STR_LEN(scan));
4369                 }
4370                 data->last_end = data->pos_min + l;
4371                 data->pos_min += l; /* As in the first entry. */
4372                 data->flags &= ~SF_BEFORE_EOL;
4373             }
4374
4375             /* ANDing the code point leaves at most it, and not in locale, and
4376              * can't match null string */
4377             if (flags & SCF_DO_STCLASS_AND) {
4378                 ssc_cp_and(data->start_class, uc);
4379                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4380                 ssc_clear_locale(data->start_class);
4381             }
4382             else if (flags & SCF_DO_STCLASS_OR) {
4383                 ssc_add_cp(data->start_class, uc);
4384                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4385
4386                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4387                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4388             }
4389             flags &= ~SCF_DO_STCLASS;
4390         }
4391         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT!, so is
4392                                                      EXACTFish */
4393             SSize_t l = STR_LEN(scan);
4394             UV uc = *((U8*)STRING(scan));
4395             SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
4396                                                      separate code points */
4397             const U8 * s = (U8*)STRING(scan);
4398
4399             /* Search for fixed substrings supports EXACT only. */
4400             if (flags & SCF_DO_SUBSTR) {
4401                 assert(data);
4402                 scan_commit(pRExC_state, data, minlenp, is_inf);
4403             }
4404             if (UTF) {
4405                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4406                 l = utf8_length(s, s + l);
4407             }
4408             if (unfolded_multi_char) {
4409                 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4410             }
4411             min += l - min_subtract;
4412             assert (min >= 0);
4413             delta += min_subtract;
4414             if (flags & SCF_DO_SUBSTR) {
4415                 data->pos_min += l - min_subtract;
4416                 if (data->pos_min < 0) {
4417                     data->pos_min = 0;
4418                 }
4419                 data->pos_delta += min_subtract;
4420                 if (min_subtract) {
4421                     data->longest = &(data->longest_float);
4422                 }
4423             }
4424
4425             if (OP(scan) != EXACTFL && flags & SCF_DO_STCLASS_AND) {
4426                 ssc_clear_locale(data->start_class);
4427             }
4428
4429             if (! UTF) {
4430
4431                 /* We punt and assume can match anything if the node begins
4432                  * with a multi-character fold.  Things are complicated.  For
4433                  * example, /ffi/i could match any of:
4434                  *  "\N{LATIN SMALL LIGATURE FFI}"
4435                  *  "\N{LATIN SMALL LIGATURE FF}I"
4436                  *  "F\N{LATIN SMALL LIGATURE FI}"
4437                  *  plus several other things; and making sure we have all the
4438                  *  possibilities is hard. */
4439                 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + STR_LEN(scan))) {
4440                     EXACTF_invlist =
4441                              _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
4442                 }
4443                 else {
4444
4445                     /* Any Latin1 range character can potentially match any
4446                      * other depending on the locale */
4447                     if (OP(scan) == EXACTFL) {
4448                         _invlist_union(EXACTF_invlist, PL_Latin1,
4449                                                               &EXACTF_invlist);
4450                     }
4451                     else {
4452                         /* But otherwise, it matches at least itself.  We can
4453                          * quickly tell if it has a distinct fold, and if so,
4454                          * it matches that as well */
4455                         EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4456                         if (IS_IN_SOME_FOLD_L1(uc)) {
4457                             EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
4458                                                            PL_fold_latin1[uc]);
4459                         }
4460                     }
4461
4462                     /* Some characters match above-Latin1 ones under /i.  This
4463                      * is true of EXACTFL ones when the locale is UTF-8 */
4464                     if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
4465                         && (! isASCII(uc) || (OP(scan) != EXACTFA
4466                                             && OP(scan) != EXACTFA_NO_TRIE)))
4467                     {
4468                         add_above_Latin1_folds(pRExC_state,
4469                                                (U8) uc,
4470                                                &EXACTF_invlist);
4471                     }
4472                 }
4473             }
4474             else {  /* Pattern is UTF-8 */
4475                 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
4476                 STRLEN foldlen = UTF8SKIP(s);
4477                 const U8* e = s + STR_LEN(scan);
4478                 SV** listp;
4479
4480                 /* The only code points that aren't folded in a UTF EXACTFish
4481                  * node are are the problematic ones in EXACTFL nodes */
4482                 if (OP(scan) == EXACTFL
4483                     && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc))
4484                 {
4485                     /* We need to check for the possibility that this EXACTFL
4486                      * node begins with a multi-char fold.  Therefore we fold
4487                      * the first few characters of it so that we can make that
4488                      * check */
4489                     U8 *d = folded;
4490                     int i;
4491
4492                     for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
4493                         if (isASCII(*s)) {
4494                             *(d++) = (U8) toFOLD(*s);
4495                             s++;
4496                         }
4497                         else {
4498                             STRLEN len;
4499                             to_utf8_fold(s, d, &len);
4500                             d += len;
4501                             s += UTF8SKIP(s);
4502                         }
4503                     }
4504
4505                     /* And set up so the code below that looks in this folded
4506                      * buffer instead of the node's string */
4507                     e = d;
4508                     foldlen = UTF8SKIP(folded);
4509                     s = folded;
4510                 }
4511
4512                 /* When we reach here 's' points to the fold of the first
4513                  * character(s) of the node; and 'e' points to far enough along
4514                  * the folded string to be just past any possible multi-char
4515                  * fold. 'foldlen' is the length in bytes of the first
4516                  * character in 's'
4517                  *
4518                  * Unlike the non-UTF-8 case, the macro for determining if a
4519                  * string is a multi-char fold requires all the characters to
4520                  * already be folded.  This is because of all the complications
4521                  * if not.  Note that they are folded anyway, except in EXACTFL
4522                  * nodes.  Like the non-UTF case above, we punt if the node
4523                  * begins with a multi-char fold  */
4524
4525                 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
4526                     EXACTF_invlist =
4527                              _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
4528                 }
4529                 else {  /* Single char fold */
4530
4531                     /* It matches all the things that fold to it, which are
4532                      * found in PL_utf8_foldclosures (including itself) */
4533                     EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4534                     if (! PL_utf8_foldclosures) {
4535                         _load_PL_utf8_foldclosures();
4536                     }
4537                     if ((listp = hv_fetch(PL_utf8_foldclosures,
4538                                         (char *) s, foldlen, FALSE)))
4539                     {
4540                         AV* list = (AV*) *listp;
4541                         IV k;
4542                         for (k = 0; k <= av_tindex(list); k++) {
4543                             SV** c_p = av_fetch(list, k, FALSE);
4544                             UV c;
4545                             assert(c_p);
4546
4547                             c = SvUV(*c_p);
4548
4549                             /* /aa doesn't allow folds between ASCII and non- */
4550                             if ((OP(scan) == EXACTFA || OP(scan) == EXACTFA_NO_TRIE)
4551                                 && isASCII(c) != isASCII(uc))
4552                             {
4553                                 continue;
4554                             }
4555
4556                             EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, c);
4557                         }
4558                     }
4559                 }
4560             }
4561             if (flags & SCF_DO_STCLASS_AND) {
4562                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4563                 ANYOF_POSIXL_ZERO(data->start_class);
4564                 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4565             }
4566             else if (flags & SCF_DO_STCLASS_OR) {
4567                 ssc_union(data->start_class, EXACTF_invlist, FALSE);
4568                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4569
4570                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4571                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4572             }
4573             flags &= ~SCF_DO_STCLASS;
4574             SvREFCNT_dec(EXACTF_invlist);
4575         }
4576         else if (REGNODE_VARIES(OP(scan))) {
4577             SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4578             I32 fl = 0, f = flags;
4579             regnode * const oscan = scan;
4580             regnode_ssc this_class;
4581             regnode_ssc *oclass = NULL;
4582             I32 next_is_eval = 0;
4583
4584             switch (PL_regkind[OP(scan)]) {
4585             case WHILEM:                /* End of (?:...)* . */
4586                 scan = NEXTOPER(scan);
4587                 goto finish;
4588             case PLUS:
4589                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4590                     next = NEXTOPER(scan);
4591                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
4592                         mincount = 1;
4593                         maxcount = REG_INFTY;
4594                         next = regnext(scan);
4595                         scan = NEXTOPER(scan);
4596                         goto do_curly;
4597                     }
4598                 }
4599                 if (flags & SCF_DO_SUBSTR)
4600                     data->pos_min++;
4601                 min++;
4602                 /* FALLTHROUGH */
4603             case STAR:
4604                 if (flags & SCF_DO_STCLASS) {
4605                     mincount = 0;
4606                     maxcount = REG_INFTY;
4607                     next = regnext(scan);
4608                     scan = NEXTOPER(scan);
4609                     goto do_curly;
4610                 }
4611                 if (flags & SCF_DO_SUBSTR) {
4612                     scan_commit(pRExC_state, data, minlenp, is_inf);
4613                     /* Cannot extend fixed substrings */
4614                     data->longest = &(data->longest_float);
4615                 }
4616                 is_inf = is_inf_internal = 1;
4617                 scan = regnext(scan);
4618                 goto optimize_curly_tail;
4619             case CURLY:
4620                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4621                     && (scan->flags == stopparen))
4622                 {
4623                     mincount = 1;
4624                     maxcount = 1;
4625                 } else {
4626                     mincount = ARG1(scan);
4627                     maxcount = ARG2(scan);
4628                 }
4629                 next = regnext(scan);
4630                 if (OP(scan) == CURLYX) {
4631                     I32 lp = (data ? *(data->last_closep) : 0);
4632                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4633                 }
4634                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4635                 next_is_eval = (OP(scan) == EVAL);
4636               do_curly:
4637                 if (flags & SCF_DO_SUBSTR) {
4638                     if (mincount == 0)
4639                         scan_commit(pRExC_state, data, minlenp, is_inf);
4640                     /* Cannot extend fixed substrings */
4641                     pos_before = data->pos_min;
4642                 }
4643                 if (data) {
4644                     fl = data->flags;
4645                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4646                     if (is_inf)
4647                         data->flags |= SF_IS_INF;
4648                 }
4649                 if (flags & SCF_DO_STCLASS) {
4650                     ssc_init(pRExC_state, &this_class);
4651                     oclass = data->start_class;
4652                     data->start_class = &this_class;
4653                     f |= SCF_DO_STCLASS_AND;
4654                     f &= ~SCF_DO_STCLASS_OR;
4655                 }
4656                 /* Exclude from super-linear cache processing any {n,m}
4657                    regops for which the combination of input pos and regex
4658                    pos is not enough information to determine if a match
4659                    will be possible.
4660
4661                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
4662                    regex pos at the \s*, the prospects for a match depend not
4663                    only on the input position but also on how many (bar\s*)
4664                    repeats into the {4,8} we are. */
4665                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4666                     f &= ~SCF_WHILEM_VISITED_POS;
4667
4668                 /* This will finish on WHILEM, setting scan, or on NULL: */
4669                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4670                                   last, data, stopparen, recursed_depth, NULL,
4671                                   (mincount == 0
4672                                    ? (f & ~SCF_DO_SUBSTR)
4673                                    : f)
4674                                   ,depth+1);
4675
4676                 if (flags & SCF_DO_STCLASS)
4677                     data->start_class = oclass;
4678                 if (mincount == 0 || minnext == 0) {
4679                     if (flags & SCF_DO_STCLASS_OR) {
4680                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4681                     }
4682                     else if (flags & SCF_DO_STCLASS_AND) {
4683                         /* Switch to OR mode: cache the old value of
4684                          * data->start_class */
4685                         INIT_AND_WITHP;
4686                         StructCopy(data->start_class, and_withp, regnode_ssc);
4687                         flags &= ~SCF_DO_STCLASS_AND;
4688                         StructCopy(&this_class, data->start_class, regnode_ssc);
4689                         flags |= SCF_DO_STCLASS_OR;
4690                         ANYOF_FLAGS(data->start_class)
4691                                                 |= SSC_MATCHES_EMPTY_STRING;
4692                     }
4693                 } else {                /* Non-zero len */
4694                     if (flags & SCF_DO_STCLASS_OR) {
4695                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4696                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4697                     }
4698                     else if (flags & SCF_DO_STCLASS_AND)
4699                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4700                     flags &= ~SCF_DO_STCLASS;
4701                 }
4702                 if (!scan)              /* It was not CURLYX, but CURLY. */
4703                     scan = next;
4704                 if (!(flags & SCF_TRIE_DOING_RESTUDY)
4705                     /* ? quantifier ok, except for (?{ ... }) */
4706                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
4707                     && (minnext == 0) && (deltanext == 0)
4708                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4709                     && maxcount <= REG_INFTY/3) /* Complement check for big
4710                                                    count */
4711                 {
4712                     /* Fatal warnings may leak the regexp without this: */
4713                     SAVEFREESV(RExC_rx_sv);
4714                     ckWARNreg(RExC_parse,
4715                             "Quantifier unexpected on zero-length expression");
4716                     (void)ReREFCNT_inc(RExC_rx_sv);
4717                 }
4718
4719                 min += minnext * mincount;
4720                 is_inf_internal |= deltanext == SSize_t_MAX
4721                          || (maxcount == REG_INFTY && minnext + deltanext > 0);
4722                 is_inf |= is_inf_internal;
4723                 if (is_inf) {
4724                     delta = SSize_t_MAX;
4725                 } else {
4726                     delta += (minnext + deltanext) * maxcount
4727                              - minnext * mincount;
4728                 }
4729                 /* Try powerful optimization CURLYX => CURLYN. */
4730                 if (  OP(oscan) == CURLYX && data
4731                       && data->flags & SF_IN_PAR
4732                       && !(data->flags & SF_HAS_EVAL)
4733                       && !deltanext && minnext == 1 ) {
4734                     /* Try to optimize to CURLYN.  */
4735                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4736                     regnode * const nxt1 = nxt;
4737 #ifdef DEBUGGING
4738                     regnode *nxt2;
4739 #endif
4740
4741                     /* Skip open. */
4742                     nxt = regnext(nxt);
4743                     if (!REGNODE_SIMPLE(OP(nxt))
4744                         && !(PL_regkind[OP(nxt)] == EXACT
4745                              && STR_LEN(nxt) == 1))
4746                         goto nogo;
4747 #ifdef DEBUGGING
4748                     nxt2 = nxt;
4749 #endif
4750                     nxt = regnext(nxt);
4751                     if (OP(nxt) != CLOSE)
4752                         goto nogo;
4753                     if (RExC_open_parens) {
4754                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4755                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4756                     }
4757                     /* Now we know that nxt2 is the only contents: */
4758                     oscan->flags = (U8)ARG(nxt);
4759                     OP(oscan) = CURLYN;
4760                     OP(nxt1) = NOTHING; /* was OPEN. */
4761
4762 #ifdef DEBUGGING
4763                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4764                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4765                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4766                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
4767                     OP(nxt + 1) = OPTIMIZED; /* was count. */
4768                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4769 #endif
4770                 }
4771               nogo:
4772
4773                 /* Try optimization CURLYX => CURLYM. */
4774                 if (  OP(oscan) == CURLYX && data
4775                       && !(data->flags & SF_HAS_PAR)
4776                       && !(data->flags & SF_HAS_EVAL)
4777                       && !deltanext     /* atom is fixed width */
4778                       && minnext != 0   /* CURLYM can't handle zero width */
4779
4780                          /* Nor characters whose fold at run-time may be
4781                           * multi-character */
4782                       && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
4783                 ) {
4784                     /* XXXX How to optimize if data == 0? */
4785                     /* Optimize to a simpler form.  */
4786                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4787                     regnode *nxt2;
4788
4789                     OP(oscan) = CURLYM;
4790                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4791                             && (OP(nxt2) != WHILEM))
4792                         nxt = nxt2;
4793                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
4794                     /* Need to optimize away parenths. */
4795                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4796                         /* Set the parenth number.  */
4797                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4798
4799                         oscan->flags = (U8)ARG(nxt);
4800                         if (RExC_open_parens) {
4801                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4802                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4803                         }
4804                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
4805                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
4806
4807 #ifdef DEBUGGING
4808                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4809                         OP(nxt + 1) = OPTIMIZED; /* was count. */
4810                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4811                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4812 #endif
4813 #if 0
4814                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
4815                             regnode *nnxt = regnext(nxt1);
4816                             if (nnxt == nxt) {
4817                                 if (reg_off_by_arg[OP(nxt1)])
4818                                     ARG_SET(nxt1, nxt2 - nxt1);
4819                                 else if (nxt2 - nxt1 < U16_MAX)
4820                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
4821                                 else
4822                                     OP(nxt) = NOTHING;  /* Cannot beautify */
4823                             }
4824                             nxt1 = nnxt;
4825                         }
4826 #endif
4827                         /* Optimize again: */
4828                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4829                                     NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4830                     }
4831                     else
4832                         oscan->flags = 0;
4833                 }
4834                 else if ((OP(oscan) == CURLYX)
4835                          && (flags & SCF_WHILEM_VISITED_POS)
4836                          /* See the comment on a similar expression above.
4837                             However, this time it's not a subexpression
4838                             we care about, but the expression itself. */
4839                          && (maxcount == REG_INFTY)
4840                          && data && ++data->whilem_c < 16) {
4841                     /* This stays as CURLYX, we can put the count/of pair. */
4842                     /* Find WHILEM (as in regexec.c) */
4843                     regnode *nxt = oscan + NEXT_OFF(oscan);
4844
4845                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4846                         nxt += ARG(nxt);
4847                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
4848                         | (RExC_whilem_seen << 4)); /* On WHILEM */
4849                 }
4850                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4851                     pars++;
4852                 if (flags & SCF_DO_SUBSTR) {
4853                     SV *last_str = NULL;
4854                     STRLEN last_chrs = 0;
4855                     int counted = mincount != 0;
4856
4857                     if (data->last_end > 0 && mincount != 0) { /* Ends with a
4858                                                                   string. */
4859                         SSize_t b = pos_before >= data->last_start_min
4860                             ? pos_before : data->last_start_min;
4861                         STRLEN l;
4862                         const char * const s = SvPV_const(data->last_found, l);
4863                         SSize_t old = b - data->last_start_min;
4864
4865                         if (UTF)
4866                             old = utf8_hop((U8*)s, old) - (U8*)s;
4867                         l -= old;
4868                         /* Get the added string: */
4869                         last_str = newSVpvn_utf8(s  + old, l, UTF);
4870                         last_chrs = UTF ? utf8_length((U8*)(s + old),
4871                                             (U8*)(s + old + l)) : l;
4872                         if (deltanext == 0 && pos_before == b) {
4873                             /* What was added is a constant string */
4874                             if (mincount > 1) {
4875
4876                                 SvGROW(last_str, (mincount * l) + 1);
4877                                 repeatcpy(SvPVX(last_str) + l,
4878                                           SvPVX_const(last_str), l,
4879                                           mincount - 1);
4880                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4881                                 /* Add additional parts. */
4882                                 SvCUR_set(data->last_found,
4883                                           SvCUR(data->last_found) - l);
4884                                 sv_catsv(data->last_found, last_str);
4885                                 {
4886                                     SV * sv = data->last_found;
4887                                     MAGIC *mg =
4888                                         SvUTF8(sv) && SvMAGICAL(sv) ?
4889                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4890                                     if (mg && mg->mg_len >= 0)
4891                                         mg->mg_len += last_chrs * (mincount-1);
4892                                 }
4893                                 last_chrs *= mincount;
4894                                 data->last_end += l * (mincount - 1);
4895                             }
4896                         } else {
4897                             /* start offset must point into the last copy */
4898                             data->last_start_min += minnext * (mincount - 1);
4899                             data->last_start_max += is_inf ? SSize_t_MAX
4900                                 : (maxcount - 1) * (minnext + data->pos_delta);
4901                         }
4902                     }
4903                     /* It is counted once already... */
4904                     data->pos_min += minnext * (mincount - counted);
4905 #if 0
4906 PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf
4907                               " SSize_t_MAX=%"UVuf" minnext=%"UVuf
4908                               " maxcount=%"UVuf" mincount=%"UVuf"\n",
4909     (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4910     (UV)mincount);
4911 if (deltanext != SSize_t_MAX)
4912 PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
4913     (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4914           - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4915 #endif
4916                     if (deltanext == SSize_t_MAX
4917                         || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4918                         data->pos_delta = SSize_t_MAX;
4919                     else
4920                         data->pos_delta += - counted * deltanext +
4921                         (minnext + deltanext) * maxcount - minnext * mincount;
4922                     if (mincount != maxcount) {
4923                          /* Cannot extend fixed substrings found inside
4924                             the group.  */
4925                         scan_commit(pRExC_state, data, minlenp, is_inf);
4926                         if (mincount && last_str) {
4927                             SV * const sv = data->last_found;
4928                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4929                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4930
4931                             if (mg)
4932                                 mg->mg_len = -1;
4933                             sv_setsv(sv, last_str);
4934                             data->last_end = data->pos_min;
4935                             data->last_start_min = data->pos_min - last_chrs;
4936                             data->last_start_max = is_inf
4937                                 ? SSize_t_MAX
4938                                 : data->pos_min + data->pos_delta - last_chrs;
4939                         }
4940                         data->longest = &(data->longest_float);
4941                     }
4942                     SvREFCNT_dec(last_str);
4943                 }
4944                 if (data && (fl & SF_HAS_EVAL))
4945                     data->flags |= SF_HAS_EVAL;
4946               optimize_curly_tail:
4947                 if (OP(oscan) != CURLYX) {
4948                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4949                            && NEXT_OFF(next))
4950                         NEXT_OFF(oscan) += NEXT_OFF(next);
4951                 }
4952                 continue;
4953
4954             default:
4955 #ifdef DEBUGGING
4956                 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
4957                                                                     OP(scan));
4958 #endif
4959             case REF:
4960             case CLUMP:
4961                 if (flags & SCF_DO_SUBSTR) {
4962                     /* Cannot expect anything... */
4963                     scan_commit(pRExC_state, data, minlenp, is_inf);
4964                     data->longest = &(data->longest_float);
4965                 }
4966                 is_inf = is_inf_internal = 1;
4967                 if (flags & SCF_DO_STCLASS_OR) {
4968                     if (OP(scan) == CLUMP) {
4969                         /* Actually is any start char, but very few code points
4970                          * aren't start characters */
4971                         ssc_match_all_cp(data->start_class);
4972                     }
4973                     else {
4974                         ssc_anything(data->start_class);
4975                     }
4976                 }
4977                 flags &= ~SCF_DO_STCLASS;
4978                 break;
4979             }
4980         }
4981         else if (OP(scan) == LNBREAK) {
4982             if (flags & SCF_DO_STCLASS) {
4983                 if (flags & SCF_DO_STCLASS_AND) {
4984                     ssc_intersection(data->start_class,
4985                                     PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
4986                     ssc_clear_locale(data->start_class);
4987                     ANYOF_FLAGS(data->start_class)
4988                                                 &= ~SSC_MATCHES_EMPTY_STRING;
4989                 }
4990                 else if (flags & SCF_DO_STCLASS_OR) {
4991                     ssc_union(data->start_class,
4992                               PL_XPosix_ptrs[_CC_VERTSPACE],
4993                               FALSE);
4994                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4995
4996                     /* See commit msg for
4997                      * 749e076fceedeb708a624933726e7989f2302f6a */
4998                     ANYOF_FLAGS(data->start_class)
4999                                                 &= ~SSC_MATCHES_EMPTY_STRING;
5000                 }
5001                 flags &= ~SCF_DO_STCLASS;
5002             }
5003             min++;
5004             delta++;    /* Because of the 2 char string cr-lf */
5005             if (flags & SCF_DO_SUBSTR) {
5006                 /* Cannot expect anything... */
5007                 scan_commit(pRExC_state, data, minlenp, is_inf);
5008                 data->pos_min += 1;
5009                 data->pos_delta += 1;
5010                 data->longest = &(data->longest_float);
5011             }
5012         }
5013         else if (REGNODE_SIMPLE(OP(scan))) {
5014
5015             if (flags & SCF_DO_SUBSTR) {
5016                 scan_commit(pRExC_state, data, minlenp, is_inf);
5017                 data->pos_min++;
5018             }
5019             min++;
5020             if (flags & SCF_DO_STCLASS) {
5021                 bool invert = 0;
5022                 SV* my_invlist = sv_2mortal(_new_invlist(0));
5023                 U8 namedclass;
5024
5025                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5026                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5027
5028                 /* Some of the logic below assumes that switching
5029                    locale on will only add false positives. */
5030                 switch (OP(scan)) {
5031
5032                 default:
5033 #ifdef DEBUGGING
5034                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5035                                                                      OP(scan));
5036 #endif
5037                 case CANY:
5038                 case SANY:
5039                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5040                         ssc_match_all_cp(data->start_class);
5041                     break;
5042
5043                 case REG_ANY:
5044                     {
5045                         SV* REG_ANY_invlist = _new_invlist(2);
5046                         REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5047                                                             '\n');
5048                         if (flags & SCF_DO_STCLASS_OR) {
5049                             ssc_union(data->start_class,
5050                                       REG_ANY_invlist,
5051                                       TRUE /* TRUE => invert, hence all but \n
5052                                             */
5053                                       );
5054                         }
5055                         else if (flags & SCF_DO_STCLASS_AND) {
5056                             ssc_intersection(data->start_class,
5057                                              REG_ANY_invlist,
5058                                              TRUE  /* TRUE => invert */
5059                                              );
5060                             ssc_clear_locale(data->start_class);
5061                         }
5062                         SvREFCNT_dec_NN(REG_ANY_invlist);
5063                     }
5064                     break;
5065
5066                 case ANYOF:
5067                     if (flags & SCF_DO_STCLASS_AND)
5068                         ssc_and(pRExC_state, data->start_class,
5069                                 (regnode_charclass *) scan);
5070                     else
5071                         ssc_or(pRExC_state, data->start_class,
5072                                                           (regnode_charclass *) scan);
5073                     break;
5074
5075                 case NPOSIXL:
5076                     invert = 1;
5077                     /* FALLTHROUGH */
5078
5079                 case POSIXL:
5080                     namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
5081                     if (flags & SCF_DO_STCLASS_AND) {
5082                         bool was_there = cBOOL(
5083                                           ANYOF_POSIXL_TEST(data->start_class,
5084                                                                  namedclass));
5085                         ANYOF_POSIXL_ZERO(data->start_class);
5086                         if (was_there) {    /* Do an AND */
5087                             ANYOF_POSIXL_SET(data->start_class, namedclass);
5088                         }
5089                         /* No individual code points can now match */
5090                         data->start_class->invlist
5091                                                 = sv_2mortal(_new_invlist(0));
5092                     }
5093                     else {
5094                         int complement = namedclass + ((invert) ? -1 : 1);
5095
5096                         assert(flags & SCF_DO_STCLASS_OR);
5097
5098                         /* If the complement of this class was already there,
5099                          * the result is that they match all code points,
5100                          * (\d + \D == everything).  Remove the classes from
5101                          * future consideration.  Locale is not relevant in
5102                          * this case */
5103                         if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
5104                             ssc_match_all_cp(data->start_class);
5105                             ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
5106                             ANYOF_POSIXL_CLEAR(data->start_class, complement);
5107                         }
5108                         else {  /* The usual case; just add this class to the
5109                                    existing set */
5110                             ANYOF_POSIXL_SET(data->start_class, namedclass);
5111                         }
5112                     }
5113                     break;
5114
5115                 case NPOSIXA:   /* For these, we always know the exact set of
5116                                    what's matched */
5117                     invert = 1;
5118                     /* FALLTHROUGH */
5119                 case POSIXA:
5120                     if (FLAGS(scan) == _CC_ASCII) {
5121                         my_invlist = PL_XPosix_ptrs[_CC_ASCII];
5122                     }
5123                     else {
5124                         _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
5125                                               PL_XPosix_ptrs[_CC_ASCII],
5126                                               &my_invlist);
5127                     }
5128                     goto join_posix;
5129
5130                 case NPOSIXD:
5131                 case NPOSIXU:
5132                     invert = 1;
5133                     /* FALLTHROUGH */
5134                 case POSIXD:
5135                 case POSIXU:
5136                     my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
5137
5138                     /* NPOSIXD matches all upper Latin1 code points unless the
5139                      * target string being matched is UTF-8, which is
5140                      * unknowable until match time.  Since we are going to
5141                      * invert, we want to get rid of all of them so that the
5142                      * inversion will match all */
5143                     if (OP(scan) == NPOSIXD) {
5144                         _invlist_subtract(my_invlist, PL_UpperLatin1,
5145                                           &my_invlist);
5146                     }
5147
5148                   join_posix:
5149
5150                     if (flags & SCF_DO_STCLASS_AND) {
5151                         ssc_intersection(data->start_class, my_invlist, invert);
5152                         ssc_clear_locale(data->start_class);
5153                     }
5154                     else {
5155                         assert(flags & SCF_DO_STCLASS_OR);
5156                         ssc_union(data->start_class, my_invlist, invert);
5157                     }
5158                 }
5159                 if (flags & SCF_DO_STCLASS_OR)
5160                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5161                 flags &= ~SCF_DO_STCLASS;
5162             }
5163         }
5164         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5165             data->flags |= (OP(scan) == MEOL
5166                             ? SF_BEFORE_MEOL
5167                             : SF_BEFORE_SEOL);
5168             scan_commit(pRExC_state, data, minlenp, is_inf);
5169
5170         }
5171         else if (  PL_regkind[OP(scan)] == BRANCHJ
5172                  /* Lookbehind, or need to calculate parens/evals/stclass: */
5173                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
5174                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
5175         {
5176             if ( OP(scan) == UNLESSM &&
5177                  scan->flags == 0 &&
5178                  OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
5179                  OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
5180             ) {
5181                 regnode *opt;
5182                 regnode *upto= regnext(scan);
5183                 DEBUG_PARSE_r({
5184                     SV * const mysv_val=sv_newmortal();
5185                     DEBUG_STUDYDATA("OPFAIL",data,depth);
5186
5187                     /*DEBUG_PARSE_MSG("opfail");*/
5188                     regprop(RExC_rx, mysv_val, upto, NULL);
5189                     PerlIO_printf(Perl_debug_log,
5190                         "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
5191                         SvPV_nolen_const(mysv_val),
5192                         (IV)REG_NODE_NUM(upto),
5193                         (IV)(upto - scan)
5194                     );
5195                 });
5196                 OP(scan) = OPFAIL;
5197                 NEXT_OFF(scan) = upto - scan;
5198                 for (opt= scan + 1; opt < upto ; opt++)
5199                     OP(opt) = OPTIMIZED;
5200                 scan= upto;
5201                 continue;
5202             }
5203             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5204                 || OP(scan) == UNLESSM )
5205             {
5206                 /* Negative Lookahead/lookbehind
5207                    In this case we can't do fixed string optimisation.
5208                 */
5209
5210                 SSize_t deltanext, minnext, fake = 0;
5211                 regnode *nscan;
5212                 regnode_ssc intrnl;
5213                 int f = 0;
5214
5215                 data_fake.flags = 0;
5216                 if (data) {
5217                     data_fake.whilem_c = data->whilem_c;
5218                     data_fake.last_closep = data->last_closep;
5219                 }
5220                 else
5221                     data_fake.last_closep = &fake;
5222                 data_fake.pos_delta = delta;
5223                 if ( flags & SCF_DO_STCLASS && !scan->flags
5224                      && OP(scan) == IFMATCH ) { /* Lookahead */
5225                     ssc_init(pRExC_state, &intrnl);
5226                     data_fake.start_class = &intrnl;
5227                     f |= SCF_DO_STCLASS_AND;
5228                 }
5229                 if (flags & SCF_WHILEM_VISITED_POS)
5230                     f |= SCF_WHILEM_VISITED_POS;
5231                 next = regnext(scan);
5232                 nscan = NEXTOPER(NEXTOPER(scan));
5233                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5234                                       last, &data_fake, stopparen,
5235                                       recursed_depth, NULL, f, depth+1);
5236                 if (scan->flags) {
5237                     if (deltanext) {
5238                         FAIL("Variable length lookbehind not implemented");
5239                     }
5240                     else if (minnext > (I32)U8_MAX) {
5241                         FAIL2("Lookbehind longer than %"UVuf" not implemented",
5242                               (UV)U8_MAX);
5243                     }
5244                     scan->flags = (U8)minnext;
5245                 }
5246                 if (data) {
5247                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5248                         pars++;
5249                     if (data_fake.flags & SF_HAS_EVAL)
5250                         data->flags |= SF_HAS_EVAL;
5251                     data->whilem_c = data_fake.whilem_c;
5252                 }
5253                 if (f & SCF_DO_STCLASS_AND) {
5254                     if (flags & SCF_DO_STCLASS_OR) {
5255                         /* OR before, AND after: ideally we would recurse with
5256                          * data_fake to get the AND applied by study of the
5257                          * remainder of the pattern, and then derecurse;
5258                          * *** HACK *** for now just treat as "no information".
5259                          * See [perl #56690].
5260                          */
5261                         ssc_init(pRExC_state, data->start_class);
5262                     }  else {
5263                         /* AND before and after: combine and continue.  These
5264                          * assertions are zero-length, so can match an EMPTY
5265                          * string */
5266                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5267                         ANYOF_FLAGS(data->start_class)
5268                                                    |= SSC_MATCHES_EMPTY_STRING;
5269                     }
5270                 }
5271             }
5272 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5273             else {
5274                 /* Positive Lookahead/lookbehind
5275                    In this case we can do fixed string optimisation,
5276                    but we must be careful about it. Note in the case of
5277                    lookbehind the positions will be offset by the minimum
5278                    length of the pattern, something we won't know about
5279                    until after the recurse.
5280                 */
5281                 SSize_t deltanext, fake = 0;
5282                 regnode *nscan;
5283                 regnode_ssc intrnl;
5284                 int f = 0;
5285                 /* We use SAVEFREEPV so that when the full compile
5286                     is finished perl will clean up the allocated
5287                     minlens when it's all done. This way we don't
5288                     have to worry about freeing them when we know
5289                     they wont be used, which would be a pain.
5290                  */
5291                 SSize_t *minnextp;
5292                 Newx( minnextp, 1, SSize_t );
5293                 SAVEFREEPV(minnextp);
5294
5295                 if (data) {
5296                     StructCopy(data, &data_fake, scan_data_t);
5297                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5298                         f |= SCF_DO_SUBSTR;
5299                         if (scan->flags)
5300                             scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5301                         data_fake.last_found=newSVsv(data->last_found);
5302                     }
5303                 }
5304                 else
5305                     data_fake.last_closep = &fake;
5306                 data_fake.flags = 0;
5307                 data_fake.pos_delta = delta;
5308                 if (is_inf)
5309                     data_fake.flags |= SF_IS_INF;
5310                 if ( flags & SCF_DO_STCLASS && !scan->flags
5311                      && OP(scan) == IFMATCH ) { /* Lookahead */
5312                     ssc_init(pRExC_state, &intrnl);
5313                     data_fake.start_class = &intrnl;
5314                     f |= SCF_DO_STCLASS_AND;
5315                 }
5316                 if (flags & SCF_WHILEM_VISITED_POS)
5317                     f |= SCF_WHILEM_VISITED_POS;
5318                 next = regnext(scan);
5319                 nscan = NEXTOPER(NEXTOPER(scan));
5320
5321                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5322                                         &deltanext, last, &data_fake,
5323                                         stopparen, recursed_depth, NULL,
5324                                         f,depth+1);
5325                 if (scan->flags) {
5326                     if (deltanext) {
5327                         FAIL("Variable length lookbehind not implemented");
5328                     }
5329                     else if (*minnextp > (I32)U8_MAX) {
5330                         FAIL2("Lookbehind longer than %"UVuf" not implemented",
5331                               (UV)U8_MAX);
5332                     }
5333                     scan->flags = (U8)*minnextp;
5334                 }
5335
5336                 *minnextp += min;
5337
5338                 if (f & SCF_DO_STCLASS_AND) {
5339                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5340                     ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
5341                 }
5342                 if (data) {
5343                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5344                         pars++;
5345                     if (data_fake.flags & SF_HAS_EVAL)
5346                         data->flags |= SF_HAS_EVAL;
5347                     data->whilem_c = data_fake.whilem_c;
5348                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5349                         if (RExC_rx->minlen<*minnextp)
5350                             RExC_rx->minlen=*minnextp;
5351                         scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5352                         SvREFCNT_dec_NN(data_fake.last_found);
5353
5354                         if ( data_fake.minlen_fixed != minlenp )
5355                         {
5356                             data->offset_fixed= data_fake.offset_fixed;
5357                             data->minlen_fixed= data_fake.minlen_fixed;
5358                             data->lookbehind_fixed+= scan->flags;
5359                         }
5360                         if ( data_fake.minlen_float != minlenp )
5361                         {
5362                             data->minlen_float= data_fake.minlen_float;
5363                             data->offset_float_min=data_fake.offset_float_min;
5364                             data->offset_float_max=data_fake.offset_float_max;
5365                             data->lookbehind_float+= scan->flags;
5366                         }
5367                     }
5368                 }
5369             }
5370 #endif
5371         }
5372         else if (OP(scan) == OPEN) {
5373             if (stopparen != (I32)ARG(scan))
5374                 pars++;
5375         }
5376         else if (OP(scan) == CLOSE) {
5377             if (stopparen == (I32)ARG(scan)) {
5378                 break;
5379             }
5380             if ((I32)ARG(scan) == is_par) {
5381                 next = regnext(scan);
5382
5383                 if ( next && (OP(next) != WHILEM) && next < last)
5384                     is_par = 0;         /* Disable optimization */
5385             }
5386             if (data)
5387                 *(data->last_closep) = ARG(scan);
5388         }
5389         else if (OP(scan) == EVAL) {
5390                 if (data)
5391                     data->flags |= SF_HAS_EVAL;
5392         }
5393         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5394             if (flags & SCF_DO_SUBSTR) {
5395                 scan_commit(pRExC_state, data, minlenp, is_inf);
5396                 flags &= ~SCF_DO_SUBSTR;
5397             }
5398             if (data && OP(scan)==ACCEPT) {
5399                 data->flags |= SCF_SEEN_ACCEPT;
5400                 if (stopmin > min)
5401                     stopmin = min;
5402             }
5403         }
5404         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5405         {
5406                 if (flags & SCF_DO_SUBSTR) {
5407                     scan_commit(pRExC_state, data, minlenp, is_inf);
5408                     data->longest = &(data->longest_float);
5409                 }
5410                 is_inf = is_inf_internal = 1;
5411                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5412                     ssc_anything(data->start_class);
5413                 flags &= ~SCF_DO_STCLASS;
5414         }
5415         else if (OP(scan) == GPOS) {
5416             if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5417                 !(delta || is_inf || (data && data->pos_delta)))
5418             {
5419                 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5420                     RExC_rx->intflags |= PREGf_ANCH_GPOS;
5421                 if (RExC_rx->gofs < (STRLEN)min)
5422                     RExC_rx->gofs = min;
5423             } else {
5424                 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5425                 RExC_rx->gofs = 0;
5426             }
5427         }
5428 #ifdef TRIE_STUDY_OPT
5429 #ifdef FULL_TRIE_STUDY
5430         else if (PL_regkind[OP(scan)] == TRIE) {
5431             /* NOTE - There is similar code to this block above for handling
5432                BRANCH nodes on the initial study.  If you change stuff here
5433                check there too. */
5434             regnode *trie_node= scan;
5435             regnode *tail= regnext(scan);
5436             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5437             SSize_t max1 = 0, min1 = SSize_t_MAX;
5438             regnode_ssc accum;
5439
5440             if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5441                 /* Cannot merge strings after this. */
5442                 scan_commit(pRExC_state, data, minlenp, is_inf);
5443             }
5444             if (flags & SCF_DO_STCLASS)
5445                 ssc_init_zero(pRExC_state, &accum);
5446
5447             if (!trie->jump) {
5448                 min1= trie->minlen;
5449                 max1= trie->maxlen;
5450             } else {
5451                 const regnode *nextbranch= NULL;
5452                 U32 word;
5453
5454                 for ( word=1 ; word <= trie->wordcount ; word++)
5455                 {
5456                     SSize_t deltanext=0, minnext=0, f = 0, fake;
5457                     regnode_ssc this_class;
5458
5459                     data_fake.flags = 0;
5460                     if (data) {
5461                         data_fake.whilem_c = data->whilem_c;
5462                         data_fake.last_closep = data->last_closep;
5463                     }
5464                     else
5465                         data_fake.last_closep = &fake;
5466                     data_fake.pos_delta = delta;
5467                     if (flags & SCF_DO_STCLASS) {
5468                         ssc_init(pRExC_state, &this_class);
5469                         data_fake.start_class = &this_class;
5470                         f = SCF_DO_STCLASS_AND;
5471                     }
5472                     if (flags & SCF_WHILEM_VISITED_POS)
5473                         f |= SCF_WHILEM_VISITED_POS;
5474
5475                     if (trie->jump[word]) {
5476                         if (!nextbranch)
5477                             nextbranch = trie_node + trie->jump[0];
5478                         scan= trie_node + trie->jump[word];
5479                         /* We go from the jump point to the branch that follows
5480                            it. Note this means we need the vestigal unused
5481                            branches even though they arent otherwise used. */
5482                         minnext = study_chunk(pRExC_state, &scan, minlenp,
5483                             &deltanext, (regnode *)nextbranch, &data_fake,
5484                             stopparen, recursed_depth, NULL, f,depth+1);
5485                     }
5486                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5487                         nextbranch= regnext((regnode*)nextbranch);
5488
5489                     if (min1 > (SSize_t)(minnext + trie->minlen))
5490                         min1 = minnext + trie->minlen;
5491                     if (deltanext == SSize_t_MAX) {
5492                         is_inf = is_inf_internal = 1;
5493                         max1 = SSize_t_MAX;
5494                     } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5495                         max1 = minnext + deltanext + trie->maxlen;
5496
5497                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5498                         pars++;
5499                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
5500                         if ( stopmin > min + min1)
5501                             stopmin = min + min1;
5502                         flags &= ~SCF_DO_SUBSTR;
5503                         if (data)
5504                             data->flags |= SCF_SEEN_ACCEPT;
5505                     }
5506                     if (data) {
5507                         if (data_fake.flags & SF_HAS_EVAL)
5508                             data->flags |= SF_HAS_EVAL;
5509                         data->whilem_c = data_fake.whilem_c;
5510                     }
5511                     if (flags & SCF_DO_STCLASS)
5512                         ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5513                 }
5514             }
5515             if (flags & SCF_DO_SUBSTR) {
5516                 data->pos_min += min1;
5517                 data->pos_delta += max1 - min1;
5518                 if (max1 != min1 || is_inf)
5519                     data->longest = &(data->longest_float);
5520             }
5521             min += min1;
5522             delta += max1 - min1;
5523             if (flags & SCF_DO_STCLASS_OR) {
5524                 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5525                 if (min1) {
5526                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5527                     flags &= ~SCF_DO_STCLASS;
5528                 }
5529             }
5530             else if (flags & SCF_DO_STCLASS_AND) {
5531                 if (min1) {
5532                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5533                     flags &= ~SCF_DO_STCLASS;
5534                 }
5535                 else {
5536                     /* Switch to OR mode: cache the old value of
5537                      * data->start_class */
5538                     INIT_AND_WITHP;
5539                     StructCopy(data->start_class, and_withp, regnode_ssc);
5540                     flags &= ~SCF_DO_STCLASS_AND;
5541                     StructCopy(&accum, data->start_class, regnode_ssc);
5542                     flags |= SCF_DO_STCLASS_OR;
5543                 }
5544             }
5545             scan= tail;
5546             continue;
5547         }
5548 #else
5549         else if (PL_regkind[OP(scan)] == TRIE) {
5550             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5551             U8*bang=NULL;
5552
5553             min += trie->minlen;
5554             delta += (trie->maxlen - trie->minlen);
5555             flags &= ~SCF_DO_STCLASS; /* xxx */
5556             if (flags & SCF_DO_SUBSTR) {
5557                 /* Cannot expect anything... */
5558                 scan_commit(pRExC_state, data, minlenp, is_inf);
5559                 data->pos_min += trie->minlen;
5560                 data->pos_delta += (trie->maxlen - trie->minlen);
5561                 if (trie->maxlen != trie->minlen)
5562                     data->longest = &(data->longest_float);
5563             }
5564             if (trie->jump) /* no more substrings -- for now /grr*/
5565                flags &= ~SCF_DO_SUBSTR;
5566         }
5567 #endif /* old or new */
5568 #endif /* TRIE_STUDY_OPT */
5569
5570         /* Else: zero-length, ignore. */
5571         scan = regnext(scan);
5572     }
5573     /* If we are exiting a recursion we can unset its recursed bit
5574      * and allow ourselves to enter it again - no danger of an
5575      * infinite loop there.
5576     if (stopparen > -1 && recursed) {
5577         DEBUG_STUDYDATA("unset:", data,depth);
5578         PAREN_UNSET( recursed, stopparen);
5579     }
5580     */
5581     if (frame) {
5582         DEBUG_STUDYDATA("frame-end:",data,depth);
5583         DEBUG_PEEP("fend", scan, depth);
5584         /* restore previous context */
5585         last = frame->last;
5586         scan = frame->next;
5587         stopparen = frame->stop;
5588         recursed_depth = frame->prev_recursed_depth;
5589         depth = depth - 1;
5590
5591         frame = frame->prev;
5592         goto fake_study_recurse;
5593     }
5594
5595   finish:
5596     assert(!frame);
5597     DEBUG_STUDYDATA("pre-fin:",data,depth);
5598
5599     *scanp = scan;
5600     *deltap = is_inf_internal ? SSize_t_MAX : delta;
5601
5602     if (flags & SCF_DO_SUBSTR && is_inf)
5603         data->pos_delta = SSize_t_MAX - data->pos_min;
5604     if (is_par > (I32)U8_MAX)
5605         is_par = 0;
5606     if (is_par && pars==1 && data) {
5607         data->flags |= SF_IN_PAR;
5608         data->flags &= ~SF_HAS_PAR;
5609     }
5610     else if (pars && data) {
5611         data->flags |= SF_HAS_PAR;
5612         data->flags &= ~SF_IN_PAR;
5613     }
5614     if (flags & SCF_DO_STCLASS_OR)
5615         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5616     if (flags & SCF_TRIE_RESTUDY)
5617         data->flags |=  SCF_TRIE_RESTUDY;
5618
5619     DEBUG_STUDYDATA("post-fin:",data,depth);
5620
5621     {
5622         SSize_t final_minlen= min < stopmin ? min : stopmin;
5623
5624         if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) {
5625             RExC_maxlen = final_minlen + delta;
5626         }
5627         return final_minlen;
5628     }
5629     /* not-reached */
5630 }
5631
5632 STATIC U32
5633 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5634 {
5635     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5636
5637     PERL_ARGS_ASSERT_ADD_DATA;
5638
5639     Renewc(RExC_rxi->data,
5640            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5641            char, struct reg_data);
5642     if(count)
5643         Renew(RExC_rxi->data->what, count + n, U8);
5644     else
5645         Newx(RExC_rxi->data->what, n, U8);
5646     RExC_rxi->data->count = count + n;
5647     Copy(s, RExC_rxi->data->what + count, n, U8);
5648     return count;
5649 }
5650
5651 /*XXX: todo make this not included in a non debugging perl, but appears to be
5652  * used anyway there, in 'use re' */
5653 #ifndef PERL_IN_XSUB_RE
5654 void
5655 Perl_reginitcolors(pTHX)
5656 {
5657     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5658     if (s) {
5659         char *t = savepv(s);
5660         int i = 0;
5661         PL_colors[0] = t;
5662         while (++i < 6) {
5663             t = strchr(t, '\t');
5664             if (t) {
5665                 *t = '\0';
5666                 PL_colors[i] = ++t;
5667             }
5668             else
5669                 PL_colors[i] = t = (char *)"";
5670         }
5671     } else {
5672         int i = 0;
5673         while (i < 6)
5674             PL_colors[i++] = (char *)"";
5675     }
5676     PL_colorset = 1;
5677 }
5678 #endif
5679
5680
5681 #ifdef TRIE_STUDY_OPT
5682 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
5683     STMT_START {                                            \
5684         if (                                                \
5685               (data.flags & SCF_TRIE_RESTUDY)               \
5686               && ! restudied++                              \
5687         ) {                                                 \
5688             dOsomething;                                    \
5689             goto reStudy;                                   \
5690         }                                                   \
5691     } STMT_END
5692 #else
5693 #define CHECK_RESTUDY_GOTO_butfirst
5694 #endif
5695
5696 /*
5697  * pregcomp - compile a regular expression into internal code
5698  *
5699  * Decides which engine's compiler to call based on the hint currently in
5700  * scope
5701  */
5702
5703 #ifndef PERL_IN_XSUB_RE
5704
5705 /* return the currently in-scope regex engine (or the default if none)  */
5706
5707 regexp_engine const *
5708 Perl_current_re_engine(pTHX)
5709 {
5710     if (IN_PERL_COMPILETIME) {
5711         HV * const table = GvHV(PL_hintgv);
5712         SV **ptr;
5713
5714         if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5715             return &PL_core_reg_engine;
5716         ptr = hv_fetchs(table, "regcomp", FALSE);
5717         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5718             return &PL_core_reg_engine;
5719         return INT2PTR(regexp_engine*,SvIV(*ptr));
5720     }
5721     else {
5722         SV *ptr;
5723         if (!PL_curcop->cop_hints_hash)
5724             return &PL_core_reg_engine;
5725         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5726         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5727             return &PL_core_reg_engine;
5728         return INT2PTR(regexp_engine*,SvIV(ptr));
5729     }
5730 }
5731
5732
5733 REGEXP *
5734 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5735 {
5736     regexp_engine const *eng = current_re_engine();
5737     GET_RE_DEBUG_FLAGS_DECL;
5738
5739     PERL_ARGS_ASSERT_PREGCOMP;
5740
5741     /* Dispatch a request to compile a regexp to correct regexp engine. */
5742     DEBUG_COMPILE_r({
5743         PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5744                         PTR2UV(eng));
5745     });
5746     return CALLREGCOMP_ENG(eng, pattern, flags);
5747 }
5748 #endif
5749
5750 /* public(ish) entry point for the perl core's own regex compiling code.
5751  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5752  * pattern rather than a list of OPs, and uses the internal engine rather
5753  * than the current one */
5754
5755 REGEXP *
5756 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5757 {
5758     SV *pat = pattern; /* defeat constness! */
5759     PERL_ARGS_ASSERT_RE_COMPILE;
5760     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5761 #ifdef PERL_IN_XSUB_RE
5762                                 &my_reg_engine,
5763 #else
5764                                 &PL_core_reg_engine,
5765 #endif
5766                                 NULL, NULL, rx_flags, 0);
5767 }
5768
5769
5770 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5771  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5772  * point to the realloced string and length.
5773  *
5774  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5775  * stuff added */
5776
5777 static void
5778 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5779                     char **pat_p, STRLEN *plen_p, int num_code_blocks)
5780 {
5781     U8 *const src = (U8*)*pat_p;
5782     U8 *dst, *d;
5783     int n=0;
5784     STRLEN s = 0;
5785     bool do_end = 0;
5786     GET_RE_DEBUG_FLAGS_DECL;
5787
5788     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5789         "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5790
5791     Newx(dst, *plen_p * 2 + 1, U8);
5792     d = dst;
5793
5794     while (s < *plen_p) {
5795         append_utf8_from_native_byte(src[s], &d);
5796         if (n < num_code_blocks) {
5797             if (!do_end && pRExC_state->code_blocks[n].start == s) {
5798                 pRExC_state->code_blocks[n].start = d - dst - 1;
5799                 assert(*(d - 1) == '(');
5800                 do_end = 1;
5801             }
5802             else if (do_end && pRExC_state->code_blocks[n].end == s) {
5803                 pRExC_state->code_blocks[n].end = d - dst - 1;
5804                 assert(*(d - 1) == ')');
5805                 do_end = 0;
5806                 n++;
5807             }
5808         }
5809         s++;
5810     }
5811     *d = '\0';
5812     *plen_p = d - dst;
5813     *pat_p = (char*) dst;
5814     SAVEFREEPV(*pat_p);
5815     RExC_orig_utf8 = RExC_utf8 = 1;
5816 }
5817
5818
5819
5820 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5821  * while recording any code block indices, and handling overloading,
5822  * nested qr// objects etc.  If pat is null, it will allocate a new
5823  * string, or just return the first arg, if there's only one.
5824  *
5825  * Returns the malloced/updated pat.
5826  * patternp and pat_count is the array of SVs to be concatted;
5827  * oplist is the optional list of ops that generated the SVs;
5828  * recompile_p is a pointer to a boolean that will be set if
5829  *   the regex will need to be recompiled.
5830  * delim, if non-null is an SV that will be inserted between each element
5831  */
5832
5833 static SV*
5834 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5835                 SV *pat, SV ** const patternp, int pat_count,
5836                 OP *oplist, bool *recompile_p, SV *delim)
5837 {
5838     SV **svp;
5839     int n = 0;
5840     bool use_delim = FALSE;
5841     bool alloced = FALSE;
5842
5843     /* if we know we have at least two args, create an empty string,
5844      * then concatenate args to that. For no args, return an empty string */
5845     if (!pat && pat_count != 1) {
5846         pat = newSVpvs("");
5847         SAVEFREESV(pat);
5848         alloced = TRUE;
5849     }
5850
5851     for (svp = patternp; svp < patternp + pat_count; svp++) {
5852         SV *sv;
5853         SV *rx  = NULL;
5854         STRLEN orig_patlen = 0;
5855         bool code = 0;
5856         SV *msv = use_delim ? delim : *svp;
5857         if (!msv) msv = &PL_sv_undef;
5858
5859         /* if we've got a delimiter, we go round the loop twice for each
5860          * svp slot (except the last), using the delimiter the second
5861          * time round */
5862         if (use_delim) {
5863             svp--;
5864             use_delim = FALSE;
5865         }
5866         else if (delim)
5867             use_delim = TRUE;
5868
5869         if (SvTYPE(msv) == SVt_PVAV) {
5870             /* we've encountered an interpolated array within
5871              * the pattern, e.g. /...@a..../. Expand the list of elements,
5872              * then recursively append elements.
5873              * The code in this block is based on S_pushav() */
5874
5875             AV *const av = (AV*)msv;
5876             const SSize_t maxarg = AvFILL(av) + 1;
5877             SV **array;
5878
5879             if (oplist) {
5880                 assert(oplist->op_type == OP_PADAV
5881                     || oplist->op_type == OP_RV2AV);
5882                 oplist = OP_SIBLING(oplist);
5883             }
5884
5885             if (SvRMAGICAL(av)) {
5886                 SSize_t i;
5887
5888                 Newx(array, maxarg, SV*);
5889                 SAVEFREEPV(array);
5890                 for (i=0; i < maxarg; i++) {
5891                     SV ** const svp = av_fetch(av, i, FALSE);
5892                     array[i] = svp ? *svp : &PL_sv_undef;
5893                 }
5894             }
5895             else
5896                 array = AvARRAY(av);
5897
5898             pat = S_concat_pat(aTHX_ pRExC_state, pat,
5899                                 array, maxarg, NULL, recompile_p,
5900                                 /* $" */
5901                                 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5902
5903             continue;
5904         }
5905
5906
5907         /* we make the assumption here that each op in the list of
5908          * op_siblings maps to one SV pushed onto the stack,
5909          * except for code blocks, with have both an OP_NULL and
5910          * and OP_CONST.
5911          * This allows us to match up the list of SVs against the
5912          * list of OPs to find the next code block.
5913          *
5914          * Note that       PUSHMARK PADSV PADSV ..
5915          * is optimised to
5916          *                 PADRANGE PADSV  PADSV  ..
5917          * so the alignment still works. */
5918
5919         if (oplist) {
5920             if (oplist->op_type == OP_NULL
5921                 && (oplist->op_flags & OPf_SPECIAL))
5922             {
5923                 assert(n < pRExC_state->num_code_blocks);
5924                 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5925                 pRExC_state->code_blocks[n].block = oplist;
5926                 pRExC_state->code_blocks[n].src_regex = NULL;
5927                 n++;
5928                 code = 1;
5929                 oplist = OP_SIBLING(oplist); /* skip CONST */
5930                 assert(oplist);
5931             }
5932             oplist = OP_SIBLING(oplist);;
5933         }
5934
5935         /* apply magic and QR overloading to arg */
5936
5937         SvGETMAGIC(msv);
5938         if (SvROK(msv) && SvAMAGIC(msv)) {
5939             SV *sv = AMG_CALLunary(msv, regexp_amg);
5940             if (sv) {
5941                 if (SvROK(sv))
5942                     sv = SvRV(sv);
5943                 if (SvTYPE(sv) != SVt_REGEXP)
5944                     Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5945                 msv = sv;
5946             }
5947         }
5948
5949         /* try concatenation overload ... */
5950         if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5951                 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5952         {
5953             sv_setsv(pat, sv);
5954             /* overloading involved: all bets are off over literal
5955              * code. Pretend we haven't seen it */
5956             pRExC_state->num_code_blocks -= n;
5957             n = 0;
5958         }
5959         else  {
5960             /* ... or failing that, try "" overload */
5961             while (SvAMAGIC(msv)
5962                     && (sv = AMG_CALLunary(msv, string_amg))
5963                     && sv != msv
5964                     &&  !(   SvROK(msv)
5965                           && SvROK(sv)
5966                           && SvRV(msv) == SvRV(sv))
5967             ) {
5968                 msv = sv;
5969                 SvGETMAGIC(msv);
5970             }
5971             if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5972                 msv = SvRV(msv);
5973
5974             if (pat) {
5975                 /* this is a partially unrolled
5976                  *     sv_catsv_nomg(pat, msv);
5977                  * that allows us to adjust code block indices if
5978                  * needed */
5979                 STRLEN dlen;
5980                 char *dst = SvPV_force_nomg(pat, dlen);
5981                 orig_patlen = dlen;
5982                 if (SvUTF8(msv) && !SvUTF8(pat)) {
5983                     S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5984                     sv_setpvn(pat, dst, dlen);
5985                     SvUTF8_on(pat);
5986                 }
5987                 sv_catsv_nomg(pat, msv);
5988                 rx = msv;
5989             }
5990             else
5991                 pat = msv;
5992
5993             if (code)
5994                 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5995         }
5996
5997         /* extract any code blocks within any embedded qr//'s */
5998         if (rx && SvTYPE(rx) == SVt_REGEXP
5999             && RX_ENGINE((REGEXP*)rx)->op_comp)
6000         {
6001
6002             RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
6003             if (ri->num_code_blocks) {
6004                 int i;
6005                 /* the presence of an embedded qr// with code means
6006                  * we should always recompile: the text of the
6007                  * qr// may not have changed, but it may be a
6008                  * different closure than last time */
6009                 *recompile_p = 1;
6010                 Renew(pRExC_state->code_blocks,
6011                     pRExC_state->num_code_blocks + ri->num_code_blocks,
6012                     struct reg_code_block);
6013                 pRExC_state->num_code_blocks += ri->num_code_blocks;
6014
6015                 for (i=0; i < ri->num_code_blocks; i++) {
6016                     struct reg_code_block *src, *dst;
6017                     STRLEN offset =  orig_patlen
6018                         + ReANY((REGEXP *)rx)->pre_prefix;
6019                     assert(n < pRExC_state->num_code_blocks);
6020                     src = &ri->code_blocks[i];
6021                     dst = &pRExC_state->code_blocks[n];
6022                     dst->start      = src->start + offset;
6023                     dst->end        = src->end   + offset;
6024                     dst->block      = src->block;
6025                     dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
6026                                             src->src_regex
6027                                                 ? src->src_regex
6028                                                 : (REGEXP*)rx);
6029                     n++;
6030                 }
6031             }
6032         }
6033     }
6034     /* avoid calling magic multiple times on a single element e.g. =~ $qr */
6035     if (alloced)
6036         SvSETMAGIC(pat);
6037
6038     return pat;
6039 }
6040
6041
6042
6043 /* see if there are any run-time code blocks in the pattern.
6044  * False positives are allowed */
6045
6046 static bool
6047 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6048                     char *pat, STRLEN plen)
6049 {
6050     int n = 0;
6051     STRLEN s;
6052
6053     PERL_UNUSED_CONTEXT;
6054
6055     for (s = 0; s < plen; s++) {
6056         if (n < pRExC_state->num_code_blocks
6057             && s == pRExC_state->code_blocks[n].start)
6058         {
6059             s = pRExC_state->code_blocks[n].end;
6060             n++;
6061             continue;
6062         }
6063         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
6064          * positives here */
6065         if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
6066             (pat[s+2] == '{'
6067                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
6068         )
6069             return 1;
6070     }
6071     return 0;
6072 }
6073
6074 /* Handle run-time code blocks. We will already have compiled any direct
6075  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
6076  * copy of it, but with any literal code blocks blanked out and
6077  * appropriate chars escaped; then feed it into
6078  *
6079  *    eval "qr'modified_pattern'"
6080  *
6081  * For example,
6082  *
6083  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
6084  *
6085  * becomes
6086  *
6087  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
6088  *
6089  * After eval_sv()-ing that, grab any new code blocks from the returned qr
6090  * and merge them with any code blocks of the original regexp.
6091  *
6092  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
6093  * instead, just save the qr and return FALSE; this tells our caller that
6094  * the original pattern needs upgrading to utf8.
6095  */
6096
6097 static bool
6098 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6099     char *pat, STRLEN plen)
6100 {
6101     SV *qr;
6102
6103     GET_RE_DEBUG_FLAGS_DECL;
6104
6105     if (pRExC_state->runtime_code_qr) {
6106         /* this is the second time we've been called; this should
6107          * only happen if the main pattern got upgraded to utf8
6108          * during compilation; re-use the qr we compiled first time
6109          * round (which should be utf8 too)
6110          */
6111         qr = pRExC_state->runtime_code_qr;
6112         pRExC_state->runtime_code_qr = NULL;
6113         assert(RExC_utf8 && SvUTF8(qr));
6114     }
6115     else {
6116         int n = 0;
6117         STRLEN s;
6118         char *p, *newpat;
6119         int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
6120         SV *sv, *qr_ref;
6121         dSP;
6122
6123         /* determine how many extra chars we need for ' and \ escaping */
6124         for (s = 0; s < plen; s++) {
6125             if (pat[s] == '\'' || pat[s] == '\\')
6126                 newlen++;
6127         }
6128
6129         Newx(newpat, newlen, char);
6130         p = newpat;
6131         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
6132
6133         for (s = 0; s < plen; s++) {
6134             if (n < pRExC_state->num_code_blocks
6135                 && s == pRExC_state->code_blocks[n].start)
6136             {
6137                 /* blank out literal code block */
6138                 assert(pat[s] == '(');
6139                 while (s <= pRExC_state->code_blocks[n].end) {
6140                     *p++ = '_';
6141                     s++;
6142                 }
6143                 s--;
6144                 n++;
6145                 continue;
6146             }
6147             if (pat[s] == '\'' || pat[s] == '\\')
6148                 *p++ = '\\';
6149             *p++ = pat[s];
6150         }
6151         *p++ = '\'';
6152         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
6153             *p++ = 'x';
6154         *p++ = '\0';
6155         DEBUG_COMPILE_r({
6156             PerlIO_printf(Perl_debug_log,
6157                 "%sre-parsing pattern for runtime code:%s %s\n",
6158                 PL_colors[4],PL_colors[5],newpat);
6159         });
6160
6161         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
6162         Safefree(newpat);
6163
6164         ENTER;
6165         SAVETMPS;
6166         PUSHSTACKi(PERLSI_REQUIRE);
6167         /* G_RE_REPARSING causes the toker to collapse \\ into \ when
6168          * parsing qr''; normally only q'' does this. It also alters
6169          * hints handling */
6170         eval_sv(sv, G_SCALAR|G_RE_REPARSING);
6171         SvREFCNT_dec_NN(sv);
6172         SPAGAIN;
6173         qr_ref = POPs;
6174         PUTBACK;
6175         {
6176             SV * const errsv = ERRSV;
6177             if (SvTRUE_NN(errsv))
6178             {
6179                 Safefree(pRExC_state->code_blocks);
6180                 /* use croak_sv ? */
6181                 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
6182             }
6183         }
6184         assert(SvROK(qr_ref));
6185         qr = SvRV(qr_ref);
6186         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
6187         /* the leaving below frees the tmp qr_ref.
6188          * Give qr a life of its own */
6189         SvREFCNT_inc(qr);
6190         POPSTACK;
6191         FREETMPS;
6192         LEAVE;
6193
6194     }
6195
6196     if (!RExC_utf8 && SvUTF8(qr)) {
6197         /* first time through; the pattern got upgraded; save the
6198          * qr for the next time through */
6199         assert(!pRExC_state->runtime_code_qr);
6200         pRExC_state->runtime_code_qr = qr;
6201         return 0;
6202     }
6203
6204
6205     /* extract any code blocks within the returned qr//  */
6206
6207
6208     /* merge the main (r1) and run-time (r2) code blocks into one */
6209     {
6210         RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6211         struct reg_code_block *new_block, *dst;
6212         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6213         int i1 = 0, i2 = 0;
6214
6215         if (!r2->num_code_blocks) /* we guessed wrong */
6216         {
6217             SvREFCNT_dec_NN(qr);
6218             return 1;
6219         }
6220
6221         Newx(new_block,
6222             r1->num_code_blocks + r2->num_code_blocks,
6223             struct reg_code_block);
6224         dst = new_block;
6225
6226         while (    i1 < r1->num_code_blocks
6227                 || i2 < r2->num_code_blocks)
6228         {
6229             struct reg_code_block *src;
6230             bool is_qr = 0;
6231
6232             if (i1 == r1->num_code_blocks) {
6233                 src = &r2->code_blocks[i2++];
6234                 is_qr = 1;
6235             }
6236             else if (i2 == r2->num_code_blocks)
6237                 src = &r1->code_blocks[i1++];
6238             else if (  r1->code_blocks[i1].start
6239                      < r2->code_blocks[i2].start)
6240             {
6241                 src = &r1->code_blocks[i1++];
6242                 assert(src->end < r2->code_blocks[i2].start);
6243             }
6244             else {
6245                 assert(  r1->code_blocks[i1].start
6246                        > r2->code_blocks[i2].start);
6247                 src = &r2->code_blocks[i2++];
6248                 is_qr = 1;
6249                 assert(src->end < r1->code_blocks[i1].start);
6250             }
6251
6252             assert(pat[src->start] == '(');
6253             assert(pat[src->end]   == ')');
6254             dst->start      = src->start;
6255             dst->end        = src->end;
6256             dst->block      = src->block;
6257             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6258                                     : src->src_regex;
6259             dst++;
6260         }
6261         r1->num_code_blocks += r2->num_code_blocks;
6262         Safefree(r1->code_blocks);
6263         r1->code_blocks = new_block;
6264     }
6265
6266     SvREFCNT_dec_NN(qr);
6267     return 1;
6268 }
6269
6270
6271 STATIC bool
6272 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6273                       SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6274                       SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6275                       STRLEN longest_length, bool eol, bool meol)
6276 {
6277     /* This is the common code for setting up the floating and fixed length
6278      * string data extracted from Perl_re_op_compile() below.  Returns a boolean
6279      * as to whether succeeded or not */
6280
6281     I32 t;
6282     SSize_t ml;
6283
6284     if (! (longest_length
6285            || (eol /* Can't have SEOL and MULTI */
6286                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6287           )
6288             /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6289         || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6290     {
6291         return FALSE;
6292     }
6293
6294     /* copy the information about the longest from the reg_scan_data
6295         over to the program. */
6296     if (SvUTF8(sv_longest)) {
6297         *rx_utf8 = sv_longest;
6298         *rx_substr = NULL;
6299     } else {
6300         *rx_substr = sv_longest;
6301         *rx_utf8 = NULL;
6302     }
6303     /* end_shift is how many chars that must be matched that
6304         follow this item. We calculate it ahead of time as once the
6305         lookbehind offset is added in we lose the ability to correctly
6306         calculate it.*/
6307     ml = minlen ? *(minlen) : (SSize_t)longest_length;
6308     *rx_end_shift = ml - offset
6309         - longest_length + (SvTAIL(sv_longest) != 0)
6310         + lookbehind;
6311
6312     t = (eol/* Can't have SEOL and MULTI */
6313          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6314     fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6315
6316     return TRUE;
6317 }
6318
6319 /*
6320  * Perl_re_op_compile - the perl internal RE engine's function to compile a
6321  * regular expression into internal code.
6322  * The pattern may be passed either as:
6323  *    a list of SVs (patternp plus pat_count)
6324  *    a list of OPs (expr)
6325  * If both are passed, the SV list is used, but the OP list indicates
6326  * which SVs are actually pre-compiled code blocks
6327  *
6328  * The SVs in the list have magic and qr overloading applied to them (and
6329  * the list may be modified in-place with replacement SVs in the latter
6330  * case).
6331  *
6332  * If the pattern hasn't changed from old_re, then old_re will be
6333  * returned.
6334  *
6335  * eng is the current engine. If that engine has an op_comp method, then
6336  * handle directly (i.e. we assume that op_comp was us); otherwise, just
6337  * do the initial concatenation of arguments and pass on to the external
6338  * engine.
6339  *
6340  * If is_bare_re is not null, set it to a boolean indicating whether the
6341  * arg list reduced (after overloading) to a single bare regex which has
6342  * been returned (i.e. /$qr/).
6343  *
6344  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6345  *
6346  * pm_flags contains the PMf_* flags, typically based on those from the
6347  * pm_flags field of the related PMOP. Currently we're only interested in
6348  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6349  *
6350  * We can't allocate space until we know how big the compiled form will be,
6351  * but we can't compile it (and thus know how big it is) until we've got a
6352  * place to put the code.  So we cheat:  we compile it twice, once with code
6353  * generation turned off and size counting turned on, and once "for real".
6354  * This also means that we don't allocate space until we are sure that the
6355  * thing really will compile successfully, and we never have to move the
6356  * code and thus invalidate pointers into it.  (Note that it has to be in
6357  * one piece because free() must be able to free it all.) [NB: not true in perl]
6358  *
6359  * Beware that the optimization-preparation code in here knows about some
6360  * of the structure of the compiled regexp.  [I'll say.]
6361  */
6362
6363 REGEXP *
6364 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6365                     OP *expr, const regexp_engine* eng, REGEXP *old_re,
6366                      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6367 {
6368     REGEXP *rx;
6369     struct regexp *r;
6370     regexp_internal *ri;
6371     STRLEN plen;
6372     char *exp;
6373     regnode *scan;
6374     I32 flags;
6375     SSize_t minlen = 0;
6376     U32 rx_flags;
6377     SV *pat;
6378     SV *code_blocksv = NULL;
6379     SV** new_patternp = patternp;
6380
6381     /* these are all flags - maybe they should be turned
6382      * into a single int with different bit masks */
6383     I32 sawlookahead = 0;
6384     I32 sawplus = 0;
6385     I32 sawopen = 0;
6386     I32 sawminmod = 0;
6387
6388     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6389     bool recompile = 0;
6390     bool runtime_code = 0;
6391     scan_data_t data;
6392     RExC_state_t RExC_state;
6393     RExC_state_t * const pRExC_state = &RExC_state;
6394 #ifdef TRIE_STUDY_OPT
6395     int restudied = 0;
6396     RExC_state_t copyRExC_state;
6397 #endif
6398     GET_RE_DEBUG_FLAGS_DECL;
6399
6400     PERL_ARGS_ASSERT_RE_OP_COMPILE;
6401
6402     DEBUG_r(if (!PL_colorset) reginitcolors());
6403
6404 #ifndef PERL_IN_XSUB_RE
6405     /* Initialize these here instead of as-needed, as is quick and avoids
6406      * having to test them each time otherwise */
6407     if (! PL_AboveLatin1) {
6408         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6409         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6410         PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6411         PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6412         PL_HasMultiCharFold =
6413                        _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6414
6415         /* This is calculated here, because the Perl program that generates the
6416          * static global ones doesn't currently have access to
6417          * NUM_ANYOF_CODE_POINTS */
6418         PL_InBitmap = _new_invlist(2);
6419         PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
6420                                                     NUM_ANYOF_CODE_POINTS - 1);
6421     }
6422 #endif
6423
6424     pRExC_state->code_blocks = NULL;
6425     pRExC_state->num_code_blocks = 0;
6426
6427     if (is_bare_re)
6428         *is_bare_re = FALSE;
6429
6430     if (expr && (expr->op_type == OP_LIST ||
6431                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6432         /* allocate code_blocks if needed */
6433         OP *o;
6434         int ncode = 0;
6435
6436         for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o))
6437             if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6438                 ncode++; /* count of DO blocks */
6439         if (ncode) {
6440             pRExC_state->num_code_blocks = ncode;
6441             Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6442         }
6443     }
6444
6445     if (!pat_count) {
6446         /* compile-time pattern with just OP_CONSTs and DO blocks */
6447
6448         int n;
6449         OP *o;
6450
6451         /* find how many CONSTs there are */
6452         assert(expr);
6453         n = 0;
6454         if (expr->op_type == OP_CONST)
6455             n = 1;
6456         else
6457             for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) {
6458                 if (o->op_type == OP_CONST)
6459                     n++;
6460             }
6461
6462         /* fake up an SV array */
6463
6464         assert(!new_patternp);
6465         Newx(new_patternp, n, SV*);
6466         SAVEFREEPV(new_patternp);
6467         pat_count = n;
6468
6469         n = 0;
6470         if (expr->op_type == OP_CONST)
6471             new_patternp[n] = cSVOPx_sv(expr);
6472         else
6473             for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) {
6474                 if (o->op_type == OP_CONST)
6475                     new_patternp[n++] = cSVOPo_sv;
6476             }
6477
6478     }
6479
6480     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6481         "Assembling pattern from %d elements%s\n", pat_count,
6482             orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6483
6484     /* set expr to the first arg op */
6485
6486     if (pRExC_state->num_code_blocks
6487          && expr->op_type != OP_CONST)
6488     {
6489             expr = cLISTOPx(expr)->op_first;
6490             assert(   expr->op_type == OP_PUSHMARK
6491                    || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6492                    || expr->op_type == OP_PADRANGE);
6493             expr = OP_SIBLING(expr);
6494     }
6495
6496     pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6497                         expr, &recompile, NULL);
6498
6499     /* handle bare (possibly after overloading) regex: foo =~ $re */
6500     {
6501         SV *re = pat;
6502         if (SvROK(re))
6503             re = SvRV(re);
6504         if (SvTYPE(re) == SVt_REGEXP) {
6505             if (is_bare_re)
6506                 *is_bare_re = TRUE;
6507             SvREFCNT_inc(re);
6508             Safefree(pRExC_state->code_blocks);
6509             DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6510                 "Precompiled pattern%s\n",
6511                     orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6512
6513             return (REGEXP*)re;
6514         }
6515     }
6516
6517     exp = SvPV_nomg(pat, plen);
6518
6519     if (!eng->op_comp) {
6520         if ((SvUTF8(pat) && IN_BYTES)
6521                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6522         {
6523             /* make a temporary copy; either to convert to bytes,
6524              * or to avoid repeating get-magic / overloaded stringify */
6525             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6526                                         (IN_BYTES ? 0 : SvUTF8(pat)));
6527         }
6528         Safefree(pRExC_state->code_blocks);
6529         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6530     }
6531
6532     /* ignore the utf8ness if the pattern is 0 length */
6533     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6534     RExC_uni_semantics = 0;
6535     RExC_contains_locale = 0;
6536     RExC_contains_i = 0;
6537     pRExC_state->runtime_code_qr = NULL;
6538
6539     DEBUG_COMPILE_r({
6540             SV *dsv= sv_newmortal();
6541             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6542             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6543                           PL_colors[4],PL_colors[5],s);
6544         });
6545
6546   redo_first_pass:
6547     /* we jump here if we upgrade the pattern to utf8 and have to
6548      * recompile */
6549
6550     if ((pm_flags & PMf_USE_RE_EVAL)
6551                 /* this second condition covers the non-regex literal case,
6552                  * i.e.  $foo =~ '(?{})'. */
6553                 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6554     )
6555         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6556
6557     /* return old regex if pattern hasn't changed */
6558     /* XXX: note in the below we have to check the flags as well as the
6559      * pattern.
6560      *
6561      * Things get a touch tricky as we have to compare the utf8 flag
6562      * independently from the compile flags.  */
6563
6564     if (   old_re
6565         && !recompile
6566         && !!RX_UTF8(old_re) == !!RExC_utf8
6567         && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6568         && RX_PRECOMP(old_re)
6569         && RX_PRELEN(old_re) == plen
6570         && memEQ(RX_PRECOMP(old_re), exp, plen)
6571         && !runtime_code /* with runtime code, always recompile */ )
6572     {
6573         Safefree(pRExC_state->code_blocks);
6574         return old_re;
6575     }
6576
6577     rx_flags = orig_rx_flags;
6578
6579     if (rx_flags & PMf_FOLD) {
6580         RExC_contains_i = 1;
6581     }
6582     if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6583
6584         /* Set to use unicode semantics if the pattern is in utf8 and has the
6585          * 'depends' charset specified, as it means unicode when utf8  */
6586         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6587     }
6588
6589     RExC_precomp = exp;
6590     RExC_flags = rx_flags;
6591     RExC_pm_flags = pm_flags;
6592
6593     if (runtime_code) {
6594         if (TAINTING_get && TAINT_get)
6595             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6596
6597         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6598             /* whoops, we have a non-utf8 pattern, whilst run-time code
6599              * got compiled as utf8. Try again with a utf8 pattern */
6600             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6601                                     pRExC_state->num_code_blocks);
6602             goto redo_first_pass;
6603         }
6604     }
6605     assert(!pRExC_state->runtime_code_qr);
6606
6607     RExC_sawback = 0;
6608
6609     RExC_seen = 0;
6610     RExC_maxlen = 0;
6611     RExC_in_lookbehind = 0;
6612     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6613     RExC_extralen = 0;
6614     RExC_override_recoding = 0;
6615     RExC_in_multi_char_class = 0;
6616
6617     /* First pass: determine size, legality. */
6618     RExC_parse = exp;
6619     RExC_start = exp;
6620     RExC_end = exp + plen;
6621     RExC_naughty = 0;
6622     RExC_npar = 1;
6623     RExC_nestroot = 0;
6624     RExC_size = 0L;
6625     RExC_emit = (regnode *) &RExC_emit_dummy;
6626     RExC_whilem_seen = 0;
6627     RExC_open_parens = NULL;
6628     RExC_close_parens = NULL;
6629     RExC_opend = NULL;
6630     RExC_paren_names = NULL;
6631 #ifdef DEBUGGING
6632     RExC_paren_name_list = NULL;
6633 #endif
6634     RExC_recurse = NULL;
6635     RExC_study_chunk_recursed = NULL;
6636     RExC_study_chunk_recursed_bytes= 0;
6637     RExC_recurse_count = 0;
6638     pRExC_state->code_index = 0;
6639
6640 #if 0 /* REGC() is (currently) a NOP at the first pass.
6641        * Clever compilers notice this and complain. --jhi */
6642     REGC((U8)REG_MAGIC, (char*)RExC_emit);
6643 #endif
6644     DEBUG_PARSE_r(
6645         PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6646         RExC_lastnum=0;
6647         RExC_lastparse=NULL;
6648     );
6649     /* reg may croak on us, not giving us a chance to free
6650        pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
6651        need it to survive as long as the regexp (qr/(?{})/).
6652        We must check that code_blocksv is not already set, because we may
6653        have jumped back to restart the sizing pass. */
6654     if (pRExC_state->code_blocks && !code_blocksv) {
6655         code_blocksv = newSV_type(SVt_PV);
6656         SAVEFREESV(code_blocksv);
6657         SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6658         SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6659     }
6660     if (reg(pRExC_state, 0, &flags,1) == NULL) {
6661         /* It's possible to write a regexp in ascii that represents Unicode
6662         codepoints outside of the byte range, such as via \x{100}. If we
6663         detect such a sequence we have to convert the entire pattern to utf8
6664         and then recompile, as our sizing calculation will have been based
6665         on 1 byte == 1 character, but we will need to use utf8 to encode
6666         at least some part of the pattern, and therefore must convert the whole
6667         thing.
6668         -- dmq */
6669         if (flags & RESTART_UTF8) {
6670             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6671                                     pRExC_state->num_code_blocks);
6672             goto redo_first_pass;
6673         }
6674         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6675     }
6676     if (code_blocksv)
6677         SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6678
6679     DEBUG_PARSE_r({
6680         PerlIO_printf(Perl_debug_log,
6681             "Required size %"IVdf" nodes\n"
6682             "Starting second pass (creation)\n",
6683             (IV)RExC_size);
6684         RExC_lastnum=0;
6685         RExC_lastparse=NULL;
6686     });
6687
6688     /* The first pass could have found things that force Unicode semantics */
6689     if ((RExC_utf8 || RExC_uni_semantics)
6690          && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6691     {
6692         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6693     }
6694
6695     /* Small enough for pointer-storage convention?
6696        If extralen==0, this means that we will not need long jumps. */
6697     if (RExC_size >= 0x10000L && RExC_extralen)
6698         RExC_size += RExC_extralen;
6699     else
6700         RExC_extralen = 0;
6701     if (RExC_whilem_seen > 15)
6702         RExC_whilem_seen = 15;
6703
6704     /* Allocate space and zero-initialize. Note, the two step process
6705        of zeroing when in debug mode, thus anything assigned has to
6706        happen after that */
6707     rx = (REGEXP*) newSV_type(SVt_REGEXP);
6708     r = ReANY(rx);
6709     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6710          char, regexp_internal);
6711     if ( r == NULL || ri == NULL )
6712         FAIL("Regexp out of space");
6713 #ifdef DEBUGGING
6714     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6715     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6716          char);
6717 #else
6718     /* bulk initialize base fields with 0. */
6719     Zero(ri, sizeof(regexp_internal), char);
6720 #endif
6721
6722     /* non-zero initialization begins here */
6723     RXi_SET( r, ri );
6724     r->engine= eng;
6725     r->extflags = rx_flags;
6726     RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6727
6728     if (pm_flags & PMf_IS_QR) {
6729         ri->code_blocks = pRExC_state->code_blocks;
6730         ri->num_code_blocks = pRExC_state->num_code_blocks;
6731     }
6732     else
6733     {
6734         int n;
6735         for (n = 0; n < pRExC_state->num_code_blocks; n++)
6736             if (pRExC_state->code_blocks[n].src_regex)
6737                 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6738         SAVEFREEPV(pRExC_state->code_blocks);
6739     }
6740
6741     {
6742         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6743         bool has_charset = (get_regex_charset(r->extflags)
6744                                                     != REGEX_DEPENDS_CHARSET);
6745
6746         /* The caret is output if there are any defaults: if not all the STD
6747          * flags are set, or if no character set specifier is needed */
6748         bool has_default =
6749                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6750                     || ! has_charset);
6751         bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
6752                                                    == REG_RUN_ON_COMMENT_SEEN);
6753         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6754                             >> RXf_PMf_STD_PMMOD_SHIFT);
6755         const char *fptr = STD_PAT_MODS;        /*"msix"*/
6756         char *p;
6757         /* Allocate for the worst case, which is all the std flags are turned
6758          * on.  If more precision is desired, we could do a population count of
6759          * the flags set.  This could be done with a small lookup table, or by
6760          * shifting, masking and adding, or even, when available, assembly
6761          * language for a machine-language population count.
6762          * We never output a minus, as all those are defaults, so are
6763          * covered by the caret */
6764         const STRLEN wraplen = plen + has_p + has_runon
6765             + has_default       /* If needs a caret */
6766
6767                 /* If needs a character set specifier */
6768             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6769             + (sizeof(STD_PAT_MODS) - 1)
6770             + (sizeof("(?:)") - 1);
6771
6772         Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6773         r->xpv_len_u.xpvlenu_pv = p;
6774         if (RExC_utf8)
6775             SvFLAGS(rx) |= SVf_UTF8;
6776         *p++='('; *p++='?';
6777
6778         /* If a default, cover it using the caret */
6779         if (has_default) {
6780             *p++= DEFAULT_PAT_MOD;
6781         }
6782         if (has_charset) {
6783             STRLEN len;
6784             const char* const name = get_regex_charset_name(r->extflags, &len);
6785             Copy(name, p, len, char);
6786             p += len;
6787         }
6788         if (has_p)
6789             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6790         {
6791             char ch;
6792             while((ch = *fptr++)) {
6793                 if(reganch & 1)
6794                     *p++ = ch;
6795                 reganch >>= 1;
6796             }
6797         }
6798
6799         *p++ = ':';
6800         Copy(RExC_precomp, p, plen, char);
6801         assert ((RX_WRAPPED(rx) - p) < 16);
6802         r->pre_prefix = p - RX_WRAPPED(rx);
6803         p += plen;
6804         if (has_runon)
6805             *p++ = '\n';
6806         *p++ = ')';
6807         *p = 0;
6808         SvCUR_set(rx, p - RX_WRAPPED(rx));
6809     }
6810
6811     r->intflags = 0;
6812     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6813
6814     /* setup various meta data about recursion, this all requires
6815      * RExC_npar to be correctly set, and a bit later on we clear it */
6816     if (RExC_seen & REG_RECURSE_SEEN) {
6817         Newxz(RExC_open_parens, RExC_npar,regnode *);
6818         SAVEFREEPV(RExC_open_parens);
6819         Newxz(RExC_close_parens,RExC_npar,regnode *);
6820         SAVEFREEPV(RExC_close_parens);
6821     }
6822     if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) {
6823         /* Note, RExC_npar is 1 + the number of parens in a pattern.
6824          * So its 1 if there are no parens. */
6825         RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6826                                          ((RExC_npar & 0x07) != 0);
6827         Newx(RExC_study_chunk_recursed,
6828              RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6829         SAVEFREEPV(RExC_study_chunk_recursed);
6830     }
6831
6832     /* Useful during FAIL. */
6833 #ifdef RE_TRACK_PATTERN_OFFSETS
6834     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6835     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6836                           "%s %"UVuf" bytes for offset annotations.\n",
6837                           ri->u.offsets ? "Got" : "Couldn't get",
6838                           (UV)((2*RExC_size+1) * sizeof(U32))));
6839 #endif
6840     SetProgLen(ri,RExC_size);
6841     RExC_rx_sv = rx;
6842     RExC_rx = r;
6843     RExC_rxi = ri;
6844
6845     /* Second pass: emit code. */
6846     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
6847     RExC_pm_flags = pm_flags;
6848     RExC_parse = exp;
6849     RExC_end = exp + plen;
6850     RExC_naughty = 0;
6851     RExC_npar = 1;
6852     RExC_emit_start = ri->program;
6853     RExC_emit = ri->program;
6854     RExC_emit_bound = ri->program + RExC_size + 1;
6855     pRExC_state->code_index = 0;
6856
6857     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6858     if (reg(pRExC_state, 0, &flags,1) == NULL) {
6859         ReREFCNT_dec(rx);
6860         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6861     }
6862     /* XXXX To minimize changes to RE engine we always allocate
6863        3-units-long substrs field. */
6864     Newx(r->substrs, 1, struct reg_substr_data);
6865     if (RExC_recurse_count) {
6866         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6867         SAVEFREEPV(RExC_recurse);
6868     }
6869
6870 reStudy:
6871     r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6872     DEBUG_r(
6873         RExC_study_chunk_recursed_count= 0;
6874     );
6875     Zero(r->substrs, 1, struct reg_substr_data);
6876     if (RExC_study_chunk_recursed)
6877         Zero(RExC_study_chunk_recursed,
6878              RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6879
6880 #ifdef TRIE_STUDY_OPT
6881     if (!restudied) {
6882         StructCopy(&zero_scan_data, &data, scan_data_t);
6883         copyRExC_state = RExC_state;
6884     } else {
6885         U32 seen=RExC_seen;
6886         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6887
6888         RExC_state = copyRExC_state;
6889         if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
6890             RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
6891         else
6892             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
6893         StructCopy(&zero_scan_data, &data, scan_data_t);
6894     }
6895 #else
6896     StructCopy(&zero_scan_data, &data, scan_data_t);
6897 #endif
6898
6899     /* Dig out information for optimizations. */
6900     r->extflags = RExC_flags; /* was pm_op */
6901     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6902
6903     if (UTF)
6904         SvUTF8_on(rx);  /* Unicode in it? */
6905     ri->regstclass = NULL;
6906     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
6907         r->intflags |= PREGf_NAUGHTY;
6908     scan = ri->program + 1;             /* First BRANCH. */
6909
6910     /* testing for BRANCH here tells us whether there is "must appear"
6911        data in the pattern. If there is then we can use it for optimisations */
6912     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /*  Only one top-level choice.
6913                                                   */
6914         SSize_t fake;
6915         STRLEN longest_float_length, longest_fixed_length;
6916         regnode_ssc ch_class; /* pointed to by data */
6917         int stclass_flag;
6918         SSize_t last_close = 0; /* pointed to by data */
6919         regnode *first= scan;
6920         regnode *first_next= regnext(first);
6921         /*
6922          * Skip introductions and multiplicators >= 1
6923          * so that we can extract the 'meat' of the pattern that must
6924          * match in the large if() sequence following.
6925          * NOTE that EXACT is NOT covered here, as it is normally
6926          * picked up by the optimiser separately.
6927          *
6928          * This is unfortunate as the optimiser isnt handling lookahead
6929          * properly currently.
6930          *
6931          */
6932         while ((OP(first) == OPEN && (sawopen = 1)) ||
6933                /* An OR of *one* alternative - should not happen now. */
6934             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6935             /* for now we can't handle lookbehind IFMATCH*/
6936             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6937             (OP(first) == PLUS) ||
6938             (OP(first) == MINMOD) ||
6939                /* An {n,m} with n>0 */
6940             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6941             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6942         {
6943                 /*
6944                  * the only op that could be a regnode is PLUS, all the rest
6945                  * will be regnode_1 or regnode_2.
6946                  *
6947                  * (yves doesn't think this is true)
6948                  */
6949                 if (OP(first) == PLUS)
6950                     sawplus = 1;
6951                 else {
6952                     if (OP(first) == MINMOD)
6953                         sawminmod = 1;
6954                     first += regarglen[OP(first)];
6955                 }
6956                 first = NEXTOPER(first);
6957                 first_next= regnext(first);
6958         }
6959
6960         /* Starting-point info. */
6961       again:
6962         DEBUG_PEEP("first:",first,0);
6963         /* Ignore EXACT as we deal with it later. */
6964         if (PL_regkind[OP(first)] == EXACT) {
6965             if (OP(first) == EXACT)
6966                 NOOP;   /* Empty, get anchored substr later. */
6967             else
6968                 ri->regstclass = first;
6969         }
6970 #ifdef TRIE_STCLASS
6971         else if (PL_regkind[OP(first)] == TRIE &&
6972                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6973         {
6974             /* this can happen only on restudy */
6975             ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
6976         }
6977 #endif
6978         else if (REGNODE_SIMPLE(OP(first)))
6979             ri->regstclass = first;
6980         else if (PL_regkind[OP(first)] == BOUND ||
6981                  PL_regkind[OP(first)] == NBOUND)
6982             ri->regstclass = first;
6983         else if (PL_regkind[OP(first)] == BOL) {
6984             r->intflags |= (OP(first) == MBOL
6985                            ? PREGf_ANCH_MBOL
6986                            : PREGf_ANCH_SBOL);
6987             first = NEXTOPER(first);
6988             goto again;
6989         }
6990         else if (OP(first) == GPOS) {
6991             r->intflags |= PREGf_ANCH_GPOS;
6992             first = NEXTOPER(first);
6993             goto again;
6994         }
6995         else if ((!sawopen || !RExC_sawback) &&
6996             !sawlookahead &&
6997             (OP(first) == STAR &&
6998             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6999             !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
7000         {
7001             /* turn .* into ^.* with an implied $*=1 */
7002             const int type =
7003                 (OP(NEXTOPER(first)) == REG_ANY)
7004                     ? PREGf_ANCH_MBOL
7005                     : PREGf_ANCH_SBOL;
7006             r->intflags |= (type | PREGf_IMPLICIT);
7007             first = NEXTOPER(first);
7008             goto again;
7009         }
7010         if (sawplus && !sawminmod && !sawlookahead
7011             && (!sawopen || !RExC_sawback)
7012             && !pRExC_state->num_code_blocks) /* May examine pos and $& */
7013             /* x+ must match at the 1st pos of run of x's */
7014             r->intflags |= PREGf_SKIP;
7015
7016         /* Scan is after the zeroth branch, first is atomic matcher. */
7017 #ifdef TRIE_STUDY_OPT
7018         DEBUG_PARSE_r(
7019             if (!restudied)
7020                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
7021                               (IV)(first - scan + 1))
7022         );
7023 #else
7024         DEBUG_PARSE_r(
7025             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
7026                 (IV)(first - scan + 1))
7027         );
7028 #endif
7029
7030
7031         /*
7032         * If there's something expensive in the r.e., find the
7033         * longest literal string that must appear and make it the
7034         * regmust.  Resolve ties in favor of later strings, since
7035         * the regstart check works with the beginning of the r.e.
7036         * and avoiding duplication strengthens checking.  Not a
7037         * strong reason, but sufficient in the absence of others.
7038         * [Now we resolve ties in favor of the earlier string if
7039         * it happens that c_offset_min has been invalidated, since the
7040         * earlier string may buy us something the later one won't.]
7041         */
7042
7043         data.longest_fixed = newSVpvs("");
7044         data.longest_float = newSVpvs("");
7045         data.last_found = newSVpvs("");
7046         data.longest = &(data.longest_fixed);
7047         ENTER_with_name("study_chunk");
7048         SAVEFREESV(data.longest_fixed);
7049         SAVEFREESV(data.longest_float);
7050         SAVEFREESV(data.last_found);
7051         first = scan;
7052         if (!ri->regstclass) {
7053             ssc_init(pRExC_state, &ch_class);
7054             data.start_class = &ch_class;
7055             stclass_flag = SCF_DO_STCLASS_AND;
7056         } else                          /* XXXX Check for BOUND? */
7057             stclass_flag = 0;
7058         data.last_closep = &last_close;
7059
7060         DEBUG_RExC_seen();
7061         minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
7062                              scan + RExC_size, /* Up to end */
7063             &data, -1, 0, NULL,
7064             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
7065                           | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
7066             0);
7067
7068
7069         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
7070
7071
7072         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
7073              && data.last_start_min == 0 && data.last_end > 0
7074              && !RExC_seen_zerolen
7075              && !(RExC_seen & REG_VERBARG_SEEN)
7076              && !(RExC_seen & REG_GPOS_SEEN)
7077         ){
7078             r->extflags |= RXf_CHECK_ALL;
7079         }
7080         scan_commit(pRExC_state, &data,&minlen,0);
7081
7082         longest_float_length = CHR_SVLEN(data.longest_float);
7083
7084         if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
7085                    && data.offset_fixed == data.offset_float_min
7086                    && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
7087             && S_setup_longest (aTHX_ pRExC_state,
7088                                     data.longest_float,
7089                                     &(r->float_utf8),
7090                                     &(r->float_substr),
7091                                     &(r->float_end_shift),
7092                                     data.lookbehind_float,
7093                                     data.offset_float_min,
7094                                     data.minlen_float,
7095                                     longest_float_length,
7096                                     cBOOL(data.flags & SF_FL_BEFORE_EOL),
7097                                     cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
7098         {
7099             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
7100             r->float_max_offset = data.offset_float_max;
7101             if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
7102                 r->float_max_offset -= data.lookbehind_float;
7103             SvREFCNT_inc_simple_void_NN(data.longest_float);
7104         }
7105         else {
7106             r->float_substr = r->float_utf8 = NULL;
7107             longest_float_length = 0;
7108         }
7109
7110         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
7111
7112         if (S_setup_longest (aTHX_ pRExC_state,
7113                                 data.longest_fixed,
7114                                 &(r->anchored_utf8),
7115                                 &(r->anchored_substr),
7116                                 &(r->anchored_end_shift),
7117                                 data.lookbehind_fixed,
7118                                 data.offset_fixed,
7119                                 data.minlen_fixed,
7120                                 longest_fixed_length,
7121                                 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
7122                                 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
7123         {
7124             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
7125             SvREFCNT_inc_simple_void_NN(data.longest_fixed);
7126         }
7127         else {
7128             r->anchored_substr = r->anchored_utf8 = NULL;
7129             longest_fixed_length = 0;
7130         }
7131         LEAVE_with_name("study_chunk");
7132
7133         if (ri->regstclass
7134             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
7135             ri->regstclass = NULL;
7136
7137         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
7138             && stclass_flag
7139             && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7140             && is_ssc_worth_it(pRExC_state, data.start_class))
7141         {
7142             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7143
7144             ssc_finalize(pRExC_state, data.start_class);
7145
7146             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7147             StructCopy(data.start_class,
7148                        (regnode_ssc*)RExC_rxi->data->data[n],
7149                        regnode_ssc);
7150             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7151             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7152             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
7153                       regprop(r, sv, (regnode*)data.start_class, NULL);
7154                       PerlIO_printf(Perl_debug_log,
7155                                     "synthetic stclass \"%s\".\n",
7156                                     SvPVX_const(sv));});
7157             data.start_class = NULL;
7158         }
7159
7160         /* A temporary algorithm prefers floated substr to fixed one to dig
7161          * more info. */
7162         if (longest_fixed_length > longest_float_length) {
7163             r->substrs->check_ix = 0;
7164             r->check_end_shift = r->anchored_end_shift;
7165             r->check_substr = r->anchored_substr;
7166             r->check_utf8 = r->anchored_utf8;
7167             r->check_offset_min = r->check_offset_max = r->anchored_offset;
7168             if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
7169                 r->intflags |= PREGf_NOSCAN;
7170         }
7171         else {
7172             r->substrs->check_ix = 1;
7173             r->check_end_shift = r->float_end_shift;
7174             r->check_substr = r->float_substr;
7175             r->check_utf8 = r->float_utf8;
7176             r->check_offset_min = r->float_min_offset;
7177             r->check_offset_max = r->float_max_offset;
7178         }
7179         if ((r->check_substr || r->check_utf8) ) {
7180             r->extflags |= RXf_USE_INTUIT;
7181             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
7182                 r->extflags |= RXf_INTUIT_TAIL;
7183         }
7184         r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
7185
7186         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
7187         if ( (STRLEN)minlen < longest_float_length )
7188             minlen= longest_float_length;
7189         if ( (STRLEN)minlen < longest_fixed_length )
7190             minlen= longest_fixed_length;
7191         */
7192     }
7193     else {
7194         /* Several toplevels. Best we can is to set minlen. */
7195         SSize_t fake;
7196         regnode_ssc ch_class;
7197         SSize_t last_close = 0;
7198
7199         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
7200
7201         scan = ri->program + 1;
7202         ssc_init(pRExC_state, &ch_class);
7203         data.start_class = &ch_class;
7204         data.last_closep = &last_close;
7205
7206         DEBUG_RExC_seen();
7207         minlen = study_chunk(pRExC_state,
7208             &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7209             SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7210                                                       ? SCF_TRIE_DOING_RESTUDY
7211                                                       : 0),
7212             0);
7213
7214         CHECK_RESTUDY_GOTO_butfirst(NOOP);
7215
7216         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7217                 = r->float_substr = r->float_utf8 = NULL;
7218
7219         if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7220             && is_ssc_worth_it(pRExC_state, data.start_class))
7221         {
7222             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7223
7224             ssc_finalize(pRExC_state, data.start_class);
7225
7226             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7227             StructCopy(data.start_class,
7228                        (regnode_ssc*)RExC_rxi->data->data[n],
7229                        regnode_ssc);
7230             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7231             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7232             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7233                       regprop(r, sv, (regnode*)data.start_class, NULL);
7234                       PerlIO_printf(Perl_debug_log,
7235                                     "synthetic stclass \"%s\".\n",
7236                                     SvPVX_const(sv));});
7237             data.start_class = NULL;
7238         }
7239     }
7240
7241     if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7242         r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7243         r->maxlen = REG_INFTY;
7244     }
7245     else {
7246         r->maxlen = RExC_maxlen;
7247     }
7248
7249     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7250        the "real" pattern. */
7251     DEBUG_OPTIMISE_r({
7252         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%"IVdf"\n",
7253                       (IV)minlen, (IV)r->minlen, (IV)RExC_maxlen);
7254     });
7255     r->minlenret = minlen;
7256     if (r->minlen < minlen)
7257         r->minlen = minlen;
7258
7259     if (RExC_seen & REG_GPOS_SEEN)
7260         r->intflags |= PREGf_GPOS_SEEN;
7261     if (RExC_seen & REG_LOOKBEHIND_SEEN)
7262         r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7263                                                 lookbehind */
7264     if (pRExC_state->num_code_blocks)
7265         r->extflags |= RXf_EVAL_SEEN;
7266     if (RExC_seen & REG_CANY_SEEN)
7267         r->intflags |= PREGf_CANY_SEEN;
7268     if (RExC_seen & REG_VERBARG_SEEN)
7269     {
7270         r->intflags |= PREGf_VERBARG_SEEN;
7271         r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7272     }
7273     if (RExC_seen & REG_CUTGROUP_SEEN)
7274         r->intflags |= PREGf_CUTGROUP_SEEN;
7275     if (pm_flags & PMf_USE_RE_EVAL)
7276         r->intflags |= PREGf_USE_RE_EVAL;
7277     if (RExC_paren_names)
7278         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7279     else
7280         RXp_PAREN_NAMES(r) = NULL;
7281
7282     /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7283      * so it can be used in pp.c */
7284     if (r->intflags & PREGf_ANCH)
7285         r->extflags |= RXf_IS_ANCHORED;
7286
7287
7288     {
7289         /* this is used to identify "special" patterns that might result
7290          * in Perl NOT calling the regex engine and instead doing the match "itself",
7291          * particularly special cases in split//. By having the regex compiler
7292          * do this pattern matching at a regop level (instead of by inspecting the pattern)
7293          * we avoid weird issues with equivalent patterns resulting in different behavior,
7294          * AND we allow non Perl engines to get the same optimizations by the setting the
7295          * flags appropriately - Yves */
7296         regnode *first = ri->program + 1;
7297         U8 fop = OP(first);
7298         regnode *next = NEXTOPER(first);
7299         U8 nop = OP(next);
7300
7301         if (PL_regkind[fop] == NOTHING && nop == END)
7302             r->extflags |= RXf_NULL;
7303         else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
7304             /* when fop is SBOL first->flags will be true only when it was
7305              * produced by parsing /\A/, and not when parsing /^/. This is
7306              * very important for the split code as there we want to
7307              * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
7308              * See rt #122761 for more details. -- Yves */
7309             r->extflags |= RXf_START_ONLY;
7310         else if (fop == PLUS
7311                  && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7312                  && OP(regnext(first)) == END)
7313             r->extflags |= RXf_WHITE;
7314         else if ( r->extflags & RXf_SPLIT
7315                   && fop == EXACT
7316                   && STR_LEN(first) == 1
7317                   && *(STRING(first)) == ' '
7318                   && OP(regnext(first)) == END )
7319             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7320
7321     }
7322
7323     if (RExC_contains_locale) {
7324         RXp_EXTFLAGS(r) |= RXf_TAINTED;
7325     }
7326
7327 #ifdef DEBUGGING
7328     if (RExC_paren_names) {
7329         ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7330         ri->data->data[ri->name_list_idx]
7331                                    = (void*)SvREFCNT_inc(RExC_paren_name_list);
7332     } else
7333 #endif
7334         ri->name_list_idx = 0;
7335
7336     if (RExC_recurse_count) {
7337         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
7338             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
7339             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
7340         }
7341     }
7342     Newxz(r->offs, RExC_npar, regexp_paren_pair);
7343     /* assume we don't need to swap parens around before we match */
7344     DEBUG_TEST_r({
7345         PerlIO_printf(Perl_debug_log,"study_chunk_recursed_count: %lu\n",
7346             (unsigned long)RExC_study_chunk_recursed_count);
7347     });
7348     DEBUG_DUMP_r({
7349         DEBUG_RExC_seen();
7350         PerlIO_printf(Perl_debug_log,"Final program:\n");
7351         regdump(r);
7352     });
7353 #ifdef RE_TRACK_PATTERN_OFFSETS
7354     DEBUG_OFFSETS_r(if (ri->u.offsets) {
7355         const STRLEN len = ri->u.offsets[0];
7356         STRLEN i;
7357         GET_RE_DEBUG_FLAGS_DECL;
7358         PerlIO_printf(Perl_debug_log,
7359                       "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7360         for (i = 1; i <= len; i++) {
7361             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7362                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7363                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7364             }
7365         PerlIO_printf(Perl_debug_log, "\n");
7366     });
7367 #endif
7368
7369 #ifdef USE_ITHREADS
7370     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7371      * by setting the regexp SV to readonly-only instead. If the
7372      * pattern's been recompiled, the USEDness should remain. */
7373     if (old_re && SvREADONLY(old_re))
7374         SvREADONLY_on(rx);
7375 #endif
7376     return rx;
7377 }
7378
7379
7380 SV*
7381 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7382                     const U32 flags)
7383 {
7384     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7385
7386     PERL_UNUSED_ARG(value);
7387
7388     if (flags & RXapif_FETCH) {
7389         return reg_named_buff_fetch(rx, key, flags);
7390     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7391         Perl_croak_no_modify();
7392         return NULL;
7393     } else if (flags & RXapif_EXISTS) {
7394         return reg_named_buff_exists(rx, key, flags)
7395             ? &PL_sv_yes
7396             : &PL_sv_no;
7397     } else if (flags & RXapif_REGNAMES) {
7398         return reg_named_buff_all(rx, flags);
7399     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7400         return reg_named_buff_scalar(rx, flags);
7401     } else {
7402         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7403         return NULL;
7404     }
7405 }
7406
7407 SV*
7408 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7409                          const U32 flags)
7410 {
7411     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7412     PERL_UNUSED_ARG(lastkey);
7413
7414     if (flags & RXapif_FIRSTKEY)
7415         return reg_named_buff_firstkey(rx, flags);
7416     else if (flags & RXapif_NEXTKEY)
7417         return reg_named_buff_nextkey(rx, flags);
7418     else {
7419         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7420                                             (int)flags);
7421         return NULL;
7422     }
7423 }
7424
7425 SV*
7426 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7427                           const U32 flags)
7428 {
7429     AV *retarray = NULL;
7430     SV *ret;
7431     struct regexp *const rx = ReANY(r);
7432
7433     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7434
7435     if (flags & RXapif_ALL)
7436         retarray=newAV();
7437
7438     if (rx && RXp_PAREN_NAMES(rx)) {
7439         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7440         if (he_str) {
7441             IV i;
7442             SV* sv_dat=HeVAL(he_str);
7443             I32 *nums=(I32*)SvPVX(sv_dat);
7444             for ( i=0; i<SvIVX(sv_dat); i++ ) {
7445                 if ((I32)(rx->nparens) >= nums[i]
7446                     && rx->offs[nums[i]].start != -1
7447                     && rx->offs[nums[i]].end != -1)
7448                 {
7449                     ret = newSVpvs("");
7450                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7451                     if (!retarray)
7452                         return ret;
7453                 } else {
7454                     if (retarray)
7455                         ret = newSVsv(&PL_sv_undef);
7456                 }
7457                 if (retarray)
7458                     av_push(retarray, ret);
7459             }
7460             if (retarray)
7461                 return newRV_noinc(MUTABLE_SV(retarray));
7462         }
7463     }
7464     return NULL;
7465 }
7466
7467 bool
7468 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7469                            const U32 flags)
7470 {
7471     struct regexp *const rx = ReANY(r);
7472
7473     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7474
7475     if (rx && RXp_PAREN_NAMES(rx)) {
7476         if (flags & RXapif_ALL) {
7477             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7478         } else {
7479             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7480             if (sv) {
7481                 SvREFCNT_dec_NN(sv);
7482                 return TRUE;
7483             } else {
7484                 return FALSE;
7485             }
7486         }
7487     } else {
7488         return FALSE;
7489     }
7490 }
7491
7492 SV*
7493 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7494 {
7495     struct regexp *const rx = ReANY(r);
7496
7497     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7498
7499     if ( rx && RXp_PAREN_NAMES(rx) ) {
7500         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7501
7502         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7503     } else {
7504         return FALSE;
7505     }
7506 }
7507
7508 SV*
7509 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7510 {
7511     struct regexp *const rx = ReANY(r);
7512     GET_RE_DEBUG_FLAGS_DECL;
7513
7514     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7515
7516     if (rx && RXp_PAREN_NAMES(rx)) {
7517         HV *hv = RXp_PAREN_NAMES(rx);
7518         HE *temphe;
7519         while ( (temphe = hv_iternext_flags(hv,0)) ) {
7520             IV i;
7521             IV parno = 0;
7522             SV* sv_dat = HeVAL(temphe);
7523             I32 *nums = (I32*)SvPVX(sv_dat);
7524             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7525                 if ((I32)(rx->lastparen) >= nums[i] &&
7526                     rx->offs[nums[i]].start != -1 &&
7527                     rx->offs[nums[i]].end != -1)
7528                 {
7529                     parno = nums[i];
7530                     break;
7531                 }
7532             }
7533             if (parno || flags & RXapif_ALL) {
7534                 return newSVhek(HeKEY_hek(temphe));
7535             }
7536         }
7537     }
7538     return NULL;
7539 }
7540
7541 SV*
7542 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7543 {
7544     SV *ret;
7545     AV *av;
7546     SSize_t length;
7547     struct regexp *const rx = ReANY(r);
7548
7549     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7550
7551     if (rx && RXp_PAREN_NAMES(rx)) {
7552         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7553             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7554         } else if (flags & RXapif_ONE) {
7555             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7556             av = MUTABLE_AV(SvRV(ret));
7557             length = av_tindex(av);
7558             SvREFCNT_dec_NN(ret);
7559             return newSViv(length + 1);
7560         } else {
7561             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7562                                                 (int)flags);
7563             return NULL;
7564         }
7565     }
7566     return &PL_sv_undef;
7567 }
7568
7569 SV*
7570 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7571 {
7572     struct regexp *const rx = ReANY(r);
7573     AV *av = newAV();
7574
7575     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7576
7577     if (rx && RXp_PAREN_NAMES(rx)) {
7578         HV *hv= RXp_PAREN_NAMES(rx);
7579         HE *temphe;
7580         (void)hv_iterinit(hv);
7581         while ( (temphe = hv_iternext_flags(hv,0)) ) {
7582             IV i;
7583             IV parno = 0;
7584             SV* sv_dat = HeVAL(temphe);
7585             I32 *nums = (I32*)SvPVX(sv_dat);
7586             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7587                 if ((I32)(rx->lastparen) >= nums[i] &&
7588                     rx->offs[nums[i]].start != -1 &&
7589                     rx->offs[nums[i]].end != -1)
7590                 {
7591                     parno = nums[i];
7592                     break;
7593                 }
7594             }
7595             if (parno || flags & RXapif_ALL) {
7596                 av_push(av, newSVhek(HeKEY_hek(temphe)));
7597             }
7598         }
7599     }
7600
7601     return newRV_noinc(MUTABLE_SV(av));
7602 }
7603
7604 void
7605 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7606                              SV * const sv)
7607 {
7608     struct regexp *const rx = ReANY(r);
7609     char *s = NULL;
7610     SSize_t i = 0;
7611     SSize_t s1, t1;
7612     I32 n = paren;
7613
7614     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7615
7616     if (      n == RX_BUFF_IDX_CARET_PREMATCH
7617            || n == RX_BUFF_IDX_CARET_FULLMATCH
7618            || n == RX_BUFF_IDX_CARET_POSTMATCH
7619        )
7620     {
7621         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7622         if (!keepcopy) {
7623             /* on something like
7624              *    $r = qr/.../;
7625              *    /$qr/p;
7626              * the KEEPCOPY is set on the PMOP rather than the regex */
7627             if (PL_curpm && r == PM_GETRE(PL_curpm))
7628                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7629         }
7630         if (!keepcopy)
7631             goto ret_undef;
7632     }
7633
7634     if (!rx->subbeg)
7635         goto ret_undef;
7636
7637     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7638         /* no need to distinguish between them any more */
7639         n = RX_BUFF_IDX_FULLMATCH;
7640
7641     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7642         && rx->offs[0].start != -1)
7643     {
7644         /* $`, ${^PREMATCH} */
7645         i = rx->offs[0].start;
7646         s = rx->subbeg;
7647     }
7648     else
7649     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7650         && rx->offs[0].end != -1)
7651     {
7652         /* $', ${^POSTMATCH} */
7653         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7654         i = rx->sublen + rx->suboffset - rx->offs[0].end;
7655     }
7656     else
7657     if ( 0 <= n && n <= (I32)rx->nparens &&
7658         (s1 = rx->offs[n].start) != -1 &&
7659         (t1 = rx->offs[n].end) != -1)
7660     {
7661         /* $&, ${^MATCH},  $1 ... */
7662         i = t1 - s1;
7663         s = rx->subbeg + s1 - rx->suboffset;
7664     } else {
7665         goto ret_undef;
7666     }
7667
7668     assert(s >= rx->subbeg);
7669     assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7670     if (i >= 0) {
7671 #ifdef NO_TAINT_SUPPORT
7672         sv_setpvn(sv, s, i);
7673 #else
7674         const int oldtainted = TAINT_get;
7675         TAINT_NOT;
7676         sv_setpvn(sv, s, i);
7677         TAINT_set(oldtainted);
7678 #endif
7679         if ( (rx->intflags & PREGf_CANY_SEEN)
7680             ? (RXp_MATCH_UTF8(rx)
7681                         && (!i || is_utf8_string((U8*)s, i)))
7682             : (RXp_MATCH_UTF8(rx)) )
7683         {
7684             SvUTF8_on(sv);
7685         }
7686         else
7687             SvUTF8_off(sv);
7688         if (TAINTING_get) {
7689             if (RXp_MATCH_TAINTED(rx)) {
7690                 if (SvTYPE(sv) >= SVt_PVMG) {
7691                     MAGIC* const mg = SvMAGIC(sv);
7692                     MAGIC* mgt;
7693                     TAINT;
7694                     SvMAGIC_set(sv, mg->mg_moremagic);
7695                     SvTAINT(sv);
7696                     if ((mgt = SvMAGIC(sv))) {
7697                         mg->mg_moremagic = mgt;
7698                         SvMAGIC_set(sv, mg);
7699                     }
7700                 } else {
7701                     TAINT;
7702                     SvTAINT(sv);
7703                 }
7704             } else
7705                 SvTAINTED_off(sv);
7706         }
7707     } else {
7708       ret_undef:
7709         sv_setsv(sv,&PL_sv_undef);
7710         return;
7711     }
7712 }
7713
7714 void
7715 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7716                                                          SV const * const value)
7717 {
7718     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7719
7720     PERL_UNUSED_ARG(rx);
7721     PERL_UNUSED_ARG(paren);
7722     PERL_UNUSED_ARG(value);
7723
7724     if (!PL_localizing)
7725         Perl_croak_no_modify();
7726 }
7727
7728 I32
7729 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7730                               const I32 paren)
7731 {
7732     struct regexp *const rx = ReANY(r);
7733     I32 i;
7734     I32 s1, t1;
7735
7736     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7737
7738     if (   paren == RX_BUFF_IDX_CARET_PREMATCH
7739         || paren == RX_BUFF_IDX_CARET_FULLMATCH
7740         || paren == RX_BUFF_IDX_CARET_POSTMATCH
7741     )
7742     {
7743         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7744         if (!keepcopy) {
7745             /* on something like
7746              *    $r = qr/.../;
7747              *    /$qr/p;
7748              * the KEEPCOPY is set on the PMOP rather than the regex */
7749             if (PL_curpm && r == PM_GETRE(PL_curpm))
7750                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7751         }
7752         if (!keepcopy)
7753             goto warn_undef;
7754     }
7755
7756     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7757     switch (paren) {
7758       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7759       case RX_BUFF_IDX_PREMATCH:       /* $` */
7760         if (rx->offs[0].start != -1) {
7761                         i = rx->offs[0].start;
7762                         if (i > 0) {
7763                                 s1 = 0;
7764                                 t1 = i;
7765                                 goto getlen;
7766                         }
7767             }
7768         return 0;
7769
7770       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7771       case RX_BUFF_IDX_POSTMATCH:       /* $' */
7772             if (rx->offs[0].end != -1) {
7773                         i = rx->sublen - rx->offs[0].end;
7774                         if (i > 0) {
7775                                 s1 = rx->offs[0].end;
7776                                 t1 = rx->sublen;
7777                                 goto getlen;
7778                         }
7779             }
7780         return 0;
7781
7782       default: /* $& / ${^MATCH}, $1, $2, ... */
7783             if (paren <= (I32)rx->nparens &&
7784             (s1 = rx->offs[paren].start) != -1 &&
7785             (t1 = rx->offs[paren].end) != -1)
7786             {
7787             i = t1 - s1;
7788             goto getlen;
7789         } else {
7790           warn_undef:
7791             if (ckWARN(WARN_UNINITIALIZED))
7792                 report_uninit((const SV *)sv);
7793             return 0;
7794         }
7795     }
7796   getlen:
7797     if (i > 0 && RXp_MATCH_UTF8(rx)) {
7798         const char * const s = rx->subbeg - rx->suboffset + s1;
7799         const U8 *ep;
7800         STRLEN el;
7801
7802         i = t1 - s1;
7803         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7804                         i = el;
7805     }
7806     return i;
7807 }
7808
7809 SV*
7810 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7811 {
7812     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7813         PERL_UNUSED_ARG(rx);
7814         if (0)
7815             return NULL;
7816         else
7817             return newSVpvs("Regexp");
7818 }
7819
7820 /* Scans the name of a named buffer from the pattern.
7821  * If flags is REG_RSN_RETURN_NULL returns null.
7822  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7823  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7824  * to the parsed name as looked up in the RExC_paren_names hash.
7825  * If there is an error throws a vFAIL().. type exception.
7826  */
7827
7828 #define REG_RSN_RETURN_NULL    0
7829 #define REG_RSN_RETURN_NAME    1
7830 #define REG_RSN_RETURN_DATA    2
7831
7832 STATIC SV*
7833 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7834 {
7835     char *name_start = RExC_parse;
7836
7837     PERL_ARGS_ASSERT_REG_SCAN_NAME;
7838
7839     assert (RExC_parse <= RExC_end);
7840     if (RExC_parse == RExC_end) NOOP;
7841     else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7842          /* skip IDFIRST by using do...while */
7843         if (UTF)
7844             do {
7845                 RExC_parse += UTF8SKIP(RExC_parse);
7846             } while (isWORDCHAR_utf8((U8*)RExC_parse));
7847         else
7848             do {
7849                 RExC_parse++;
7850             } while (isWORDCHAR(*RExC_parse));
7851     } else {
7852         RExC_parse++; /* so the <- from the vFAIL is after the offending
7853                          character */
7854         vFAIL("Group name must start with a non-digit word character");
7855     }
7856     if ( flags ) {
7857         SV* sv_name
7858             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7859                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7860         if ( flags == REG_RSN_RETURN_NAME)
7861             return sv_name;
7862         else if (flags==REG_RSN_RETURN_DATA) {
7863             HE *he_str = NULL;
7864             SV *sv_dat = NULL;
7865             if ( ! sv_name )      /* should not happen*/
7866                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7867             if (RExC_paren_names)
7868                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7869             if ( he_str )
7870                 sv_dat = HeVAL(he_str);
7871             if ( ! sv_dat )
7872                 vFAIL("Reference to nonexistent named group");
7873             return sv_dat;
7874         }
7875         else {
7876             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7877                        (unsigned long) flags);
7878         }
7879         assert(0); /* NOT REACHED */
7880     }
7881     return NULL;
7882 }
7883
7884 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
7885     int rem=(int)(RExC_end - RExC_parse);                       \
7886     int cut;                                                    \
7887     int num;                                                    \
7888     int iscut=0;                                                \
7889     if (rem>10) {                                               \
7890         rem=10;                                                 \
7891         iscut=1;                                                \
7892     }                                                           \
7893     cut=10-rem;                                                 \
7894     if (RExC_lastparse!=RExC_parse)                             \
7895         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
7896             rem, RExC_parse,                                    \
7897             cut + 4,                                            \
7898             iscut ? "..." : "<"                                 \
7899         );                                                      \
7900     else                                                        \
7901         PerlIO_printf(Perl_debug_log,"%16s","");                \
7902                                                                 \
7903     if (SIZE_ONLY)                                              \
7904        num = RExC_size + 1;                                     \
7905     else                                                        \
7906        num=REG_NODE_NUM(RExC_emit);                             \
7907     if (RExC_lastnum!=num)                                      \
7908        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
7909     else                                                        \
7910        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
7911     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
7912         (int)((depth*2)), "",                                   \
7913         (funcname)                                              \
7914     );                                                          \
7915     RExC_lastnum=num;                                           \
7916     RExC_lastparse=RExC_parse;                                  \
7917 })
7918
7919
7920
7921 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
7922     DEBUG_PARSE_MSG((funcname));                            \
7923     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
7924 })
7925 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
7926     DEBUG_PARSE_MSG((funcname));                            \
7927     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
7928 })
7929
7930 /* This section of code defines the inversion list object and its methods.  The
7931  * interfaces are highly subject to change, so as much as possible is static to
7932  * this file.  An inversion list is here implemented as a malloc'd C UV array
7933  * as an SVt_INVLIST scalar.
7934  *
7935  * An inversion list for Unicode is an array of code points, sorted by ordinal
7936  * number.  The zeroth element is the first code point in the list.  The 1th
7937  * element is the first element beyond that not in the list.  In other words,
7938  * the first range is
7939  *  invlist[0]..(invlist[1]-1)
7940  * The other ranges follow.  Thus every element whose index is divisible by two
7941  * marks the beginning of a range that is in the list, and every element not
7942  * divisible by two marks the beginning of a range not in the list.  A single
7943  * element inversion list that contains the single code point N generally
7944  * consists of two elements
7945  *  invlist[0] == N
7946  *  invlist[1] == N+1
7947  * (The exception is when N is the highest representable value on the
7948  * machine, in which case the list containing just it would be a single
7949  * element, itself.  By extension, if the last range in the list extends to
7950  * infinity, then the first element of that range will be in the inversion list
7951  * at a position that is divisible by two, and is the final element in the
7952  * list.)
7953  * Taking the complement (inverting) an inversion list is quite simple, if the
7954  * first element is 0, remove it; otherwise add a 0 element at the beginning.
7955  * This implementation reserves an element at the beginning of each inversion
7956  * list to always contain 0; there is an additional flag in the header which
7957  * indicates if the list begins at the 0, or is offset to begin at the next
7958  * element.
7959  *
7960  * More about inversion lists can be found in "Unicode Demystified"
7961  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7962  * More will be coming when functionality is added later.
7963  *
7964  * The inversion list data structure is currently implemented as an SV pointing
7965  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
7966  * array of UV whose memory management is automatically handled by the existing
7967  * facilities for SV's.
7968  *
7969  * Some of the methods should always be private to the implementation, and some
7970  * should eventually be made public */
7971
7972 /* The header definitions are in F<inline_invlist.c> */
7973
7974 PERL_STATIC_INLINE UV*
7975 S__invlist_array_init(SV* const invlist, const bool will_have_0)
7976 {
7977     /* Returns a pointer to the first element in the inversion list's array.
7978      * This is called upon initialization of an inversion list.  Where the
7979      * array begins depends on whether the list has the code point U+0000 in it
7980      * or not.  The other parameter tells it whether the code that follows this
7981      * call is about to put a 0 in the inversion list or not.  The first
7982      * element is either the element reserved for 0, if TRUE, or the element
7983      * after it, if FALSE */
7984
7985     bool* offset = get_invlist_offset_addr(invlist);
7986     UV* zero_addr = (UV *) SvPVX(invlist);
7987
7988     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7989
7990     /* Must be empty */
7991     assert(! _invlist_len(invlist));
7992
7993     *zero_addr = 0;
7994
7995     /* 1^1 = 0; 1^0 = 1 */
7996     *offset = 1 ^ will_have_0;
7997     return zero_addr + *offset;
7998 }
7999
8000 PERL_STATIC_INLINE UV*
8001 S_invlist_array(SV* const invlist)
8002 {
8003     /* Returns the pointer to the inversion list's array.  Every time the
8004      * length changes, this needs to be called in case malloc or realloc moved
8005      * it */
8006
8007     PERL_ARGS_ASSERT_INVLIST_ARRAY;
8008
8009     /* Must not be empty.  If these fail, you probably didn't check for <len>
8010      * being non-zero before trying to get the array */
8011     assert(_invlist_len(invlist));
8012
8013     /* The very first element always contains zero, The array begins either
8014      * there, or if the inversion list is offset, at the element after it.
8015      * The offset header field determines which; it contains 0 or 1 to indicate
8016      * how much additionally to add */
8017     assert(0 == *(SvPVX(invlist)));
8018     return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
8019 }
8020
8021 PERL_STATIC_INLINE void
8022 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
8023 {
8024     /* Sets the current number of elements stored in the inversion list.
8025      * Updates SvCUR correspondingly */
8026     PERL_UNUSED_CONTEXT;
8027     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
8028
8029     assert(SvTYPE(invlist) == SVt_INVLIST);
8030
8031     SvCUR_set(invlist,
8032               (len == 0)
8033                ? 0
8034                : TO_INTERNAL_SIZE(len + offset));
8035     assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
8036 }
8037
8038 PERL_STATIC_INLINE IV*
8039 S_get_invlist_previous_index_addr(SV* invlist)
8040 {
8041     /* Return the address of the IV that is reserved to hold the cached index
8042      * */
8043     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
8044
8045     assert(SvTYPE(invlist) == SVt_INVLIST);
8046
8047     return &(((XINVLIST*) SvANY(invlist))->prev_index);
8048 }
8049
8050 PERL_STATIC_INLINE IV
8051 S_invlist_previous_index(SV* const invlist)
8052 {
8053     /* Returns cached index of previous search */
8054
8055     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
8056
8057     return *get_invlist_previous_index_addr(invlist);
8058 }
8059
8060 PERL_STATIC_INLINE void
8061 S_invlist_set_previous_index(SV* const invlist, const IV index)
8062 {
8063     /* Caches <index> for later retrieval */
8064
8065     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
8066
8067     assert(index == 0 || index < (int) _invlist_len(invlist));
8068
8069     *get_invlist_previous_index_addr(invlist) = index;
8070 }
8071
8072 PERL_STATIC_INLINE UV
8073 S_invlist_max(SV* const invlist)
8074 {
8075     /* Returns the maximum number of elements storable in the inversion list's
8076      * array, without having to realloc() */
8077
8078     PERL_ARGS_ASSERT_INVLIST_MAX;
8079
8080     assert(SvTYPE(invlist) == SVt_INVLIST);
8081
8082     /* Assumes worst case, in which the 0 element is not counted in the
8083      * inversion list, so subtracts 1 for that */
8084     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
8085            ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
8086            : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
8087 }
8088
8089 #ifndef PERL_IN_XSUB_RE
8090 SV*
8091 Perl__new_invlist(pTHX_ IV initial_size)
8092 {
8093
8094     /* Return a pointer to a newly constructed inversion list, with enough
8095      * space to store 'initial_size' elements.  If that number is negative, a
8096      * system default is used instead */
8097
8098     SV* new_list;
8099
8100     if (initial_size < 0) {
8101         initial_size = 10;
8102     }
8103
8104     /* Allocate the initial space */
8105     new_list = newSV_type(SVt_INVLIST);
8106
8107     /* First 1 is in case the zero element isn't in the list; second 1 is for
8108      * trailing NUL */
8109     SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
8110     invlist_set_len(new_list, 0, 0);
8111
8112     /* Force iterinit() to be used to get iteration to work */
8113     *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
8114
8115     *get_invlist_previous_index_addr(new_list) = 0;
8116
8117     return new_list;
8118 }
8119
8120 SV*
8121 Perl__new_invlist_C_array(pTHX_ const UV* const list)
8122 {
8123     /* Return a pointer to a newly constructed inversion list, initialized to
8124      * point to <list>, which has to be in the exact correct inversion list
8125      * form, including internal fields.  Thus this is a dangerous routine that
8126      * should not be used in the wrong hands.  The passed in 'list' contains
8127      * several header fields at the beginning that are not part of the
8128      * inversion list body proper */
8129
8130     const STRLEN length = (STRLEN) list[0];
8131     const UV version_id =          list[1];
8132     const bool offset   =    cBOOL(list[2]);
8133 #define HEADER_LENGTH 3
8134     /* If any of the above changes in any way, you must change HEADER_LENGTH
8135      * (if appropriate) and regenerate INVLIST_VERSION_ID by running
8136      *      perl -E 'say int(rand 2**31-1)'
8137      */
8138 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
8139                                         data structure type, so that one being
8140                                         passed in can be validated to be an
8141                                         inversion list of the correct vintage.
8142                                        */
8143
8144     SV* invlist = newSV_type(SVt_INVLIST);
8145
8146     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
8147
8148     if (version_id != INVLIST_VERSION_ID) {
8149         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
8150     }
8151
8152     /* The generated array passed in includes header elements that aren't part
8153      * of the list proper, so start it just after them */
8154     SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
8155
8156     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
8157                                shouldn't touch it */
8158
8159     *(get_invlist_offset_addr(invlist)) = offset;
8160
8161     /* The 'length' passed to us is the physical number of elements in the
8162      * inversion list.  But if there is an offset the logical number is one
8163      * less than that */
8164     invlist_set_len(invlist, length  - offset, offset);
8165
8166     invlist_set_previous_index(invlist, 0);
8167
8168     /* Initialize the iteration pointer. */
8169     invlist_iterfinish(invlist);
8170
8171     SvREADONLY_on(invlist);
8172
8173     return invlist;
8174 }
8175 #endif /* ifndef PERL_IN_XSUB_RE */
8176
8177 STATIC void
8178 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
8179 {
8180     /* Grow the maximum size of an inversion list */
8181
8182     PERL_ARGS_ASSERT_INVLIST_EXTEND;
8183
8184     assert(SvTYPE(invlist) == SVt_INVLIST);
8185
8186     /* Add one to account for the zero element at the beginning which may not
8187      * be counted by the calling parameters */
8188     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
8189 }
8190
8191 PERL_STATIC_INLINE void
8192 S_invlist_trim(SV* const invlist)
8193 {
8194     PERL_ARGS_ASSERT_INVLIST_TRIM;
8195
8196     assert(SvTYPE(invlist) == SVt_INVLIST);
8197
8198     /* Change the length of the inversion list to how many entries it currently
8199      * has */
8200     SvPV_shrink_to_cur((SV *) invlist);
8201 }
8202
8203 STATIC void
8204 S__append_range_to_invlist(pTHX_ SV* const invlist,
8205                                  const UV start, const UV end)
8206 {
8207    /* Subject to change or removal.  Append the range from 'start' to 'end' at
8208     * the end of the inversion list.  The range must be above any existing
8209     * ones. */
8210
8211     UV* array;
8212     UV max = invlist_max(invlist);
8213     UV len = _invlist_len(invlist);
8214     bool offset;
8215
8216     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8217
8218     if (len == 0) { /* Empty lists must be initialized */
8219         offset = start != 0;
8220         array = _invlist_array_init(invlist, ! offset);
8221     }
8222     else {
8223         /* Here, the existing list is non-empty. The current max entry in the
8224          * list is generally the first value not in the set, except when the
8225          * set extends to the end of permissible values, in which case it is
8226          * the first entry in that final set, and so this call is an attempt to
8227          * append out-of-order */
8228
8229         UV final_element = len - 1;
8230         array = invlist_array(invlist);
8231         if (array[final_element] > start
8232             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8233         {
8234             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8235                      array[final_element], start,
8236                      ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8237         }
8238
8239         /* Here, it is a legal append.  If the new range begins with the first
8240          * value not in the set, it is extending the set, so the new first
8241          * value not in the set is one greater than the newly extended range.
8242          * */
8243         offset = *get_invlist_offset_addr(invlist);
8244         if (array[final_element] == start) {
8245             if (end != UV_MAX) {
8246                 array[final_element] = end + 1;
8247             }
8248             else {
8249                 /* But if the end is the maximum representable on the machine,
8250                  * just let the range that this would extend to have no end */
8251                 invlist_set_len(invlist, len - 1, offset);
8252             }
8253             return;
8254         }
8255     }
8256
8257     /* Here the new range doesn't extend any existing set.  Add it */
8258
8259     len += 2;   /* Includes an element each for the start and end of range */
8260
8261     /* If wll overflow the existing space, extend, which may cause the array to
8262      * be moved */
8263     if (max < len) {
8264         invlist_extend(invlist, len);
8265
8266         /* Have to set len here to avoid assert failure in invlist_array() */
8267         invlist_set_len(invlist, len, offset);
8268
8269         array = invlist_array(invlist);
8270     }
8271     else {
8272         invlist_set_len(invlist, len, offset);
8273     }
8274
8275     /* The next item on the list starts the range, the one after that is
8276      * one past the new range.  */
8277     array[len - 2] = start;
8278     if (end != UV_MAX) {
8279         array[len - 1] = end + 1;
8280     }
8281     else {
8282         /* But if the end is the maximum representable on the machine, just let
8283          * the range have no end */
8284         invlist_set_len(invlist, len - 1, offset);
8285     }
8286 }
8287
8288 #ifndef PERL_IN_XSUB_RE
8289
8290 IV
8291 Perl__invlist_search(SV* const invlist, const UV cp)
8292 {
8293     /* Searches the inversion list for the entry that contains the input code
8294      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
8295      * return value is the index into the list's array of the range that
8296      * contains <cp> */
8297
8298     IV low = 0;
8299     IV mid;
8300     IV high = _invlist_len(invlist);
8301     const IV highest_element = high - 1;
8302     const UV* array;
8303
8304     PERL_ARGS_ASSERT__INVLIST_SEARCH;
8305
8306     /* If list is empty, return failure. */
8307     if (high == 0) {
8308         return -1;
8309     }
8310
8311     /* (We can't get the array unless we know the list is non-empty) */
8312     array = invlist_array(invlist);
8313
8314     mid = invlist_previous_index(invlist);
8315     assert(mid >=0 && mid <= highest_element);
8316
8317     /* <mid> contains the cache of the result of the previous call to this
8318      * function (0 the first time).  See if this call is for the same result,
8319      * or if it is for mid-1.  This is under the theory that calls to this
8320      * function will often be for related code points that are near each other.
8321      * And benchmarks show that caching gives better results.  We also test
8322      * here if the code point is within the bounds of the list.  These tests
8323      * replace others that would have had to be made anyway to make sure that
8324      * the array bounds were not exceeded, and these give us extra information
8325      * at the same time */
8326     if (cp >= array[mid]) {
8327         if (cp >= array[highest_element]) {
8328             return highest_element;
8329         }
8330
8331         /* Here, array[mid] <= cp < array[highest_element].  This means that
8332          * the final element is not the answer, so can exclude it; it also
8333          * means that <mid> is not the final element, so can refer to 'mid + 1'
8334          * safely */
8335         if (cp < array[mid + 1]) {
8336             return mid;
8337         }
8338         high--;
8339         low = mid + 1;
8340     }
8341     else { /* cp < aray[mid] */
8342         if (cp < array[0]) { /* Fail if outside the array */
8343             return -1;
8344         }
8345         high = mid;
8346         if (cp >= array[mid - 1]) {
8347             goto found_entry;
8348         }
8349     }
8350
8351     /* Binary search.  What we are looking for is <i> such that
8352      *  array[i] <= cp < array[i+1]
8353      * The loop below converges on the i+1.  Note that there may not be an
8354      * (i+1)th element in the array, and things work nonetheless */
8355     while (low < high) {
8356         mid = (low + high) / 2;
8357         assert(mid <= highest_element);
8358         if (array[mid] <= cp) { /* cp >= array[mid] */
8359             low = mid + 1;
8360
8361             /* We could do this extra test to exit the loop early.
8362             if (cp < array[low]) {
8363                 return mid;
8364             }
8365             */
8366         }
8367         else { /* cp < array[mid] */
8368             high = mid;
8369         }
8370     }
8371
8372   found_entry:
8373     high--;
8374     invlist_set_previous_index(invlist, high);
8375     return high;
8376 }
8377
8378 void
8379 Perl__invlist_populate_swatch(SV* const invlist,
8380                               const UV start, const UV end, U8* swatch)
8381 {
8382     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8383      * but is used when the swash has an inversion list.  This makes this much
8384      * faster, as it uses a binary search instead of a linear one.  This is
8385      * intimately tied to that function, and perhaps should be in utf8.c,
8386      * except it is intimately tied to inversion lists as well.  It assumes
8387      * that <swatch> is all 0's on input */
8388
8389     UV current = start;
8390     const IV len = _invlist_len(invlist);
8391     IV i;
8392     const UV * array;
8393
8394     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8395
8396     if (len == 0) { /* Empty inversion list */
8397         return;
8398     }
8399
8400     array = invlist_array(invlist);
8401
8402     /* Find which element it is */
8403     i = _invlist_search(invlist, start);
8404
8405     /* We populate from <start> to <end> */
8406     while (current < end) {
8407         UV upper;
8408
8409         /* The inversion list gives the results for every possible code point
8410          * after the first one in the list.  Only those ranges whose index is
8411          * even are ones that the inversion list matches.  For the odd ones,
8412          * and if the initial code point is not in the list, we have to skip
8413          * forward to the next element */
8414         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8415             i++;
8416             if (i >= len) { /* Finished if beyond the end of the array */
8417                 return;
8418             }
8419             current = array[i];
8420             if (current >= end) {   /* Finished if beyond the end of what we
8421                                        are populating */
8422                 if (LIKELY(end < UV_MAX)) {
8423                     return;
8424                 }
8425
8426                 /* We get here when the upper bound is the maximum
8427                  * representable on the machine, and we are looking for just
8428                  * that code point.  Have to special case it */
8429                 i = len;
8430                 goto join_end_of_list;
8431             }
8432         }
8433         assert(current >= start);
8434
8435         /* The current range ends one below the next one, except don't go past
8436          * <end> */
8437         i++;
8438         upper = (i < len && array[i] < end) ? array[i] : end;
8439
8440         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
8441          * for each code point in it */
8442         for (; current < upper; current++) {
8443             const STRLEN offset = (STRLEN)(current - start);
8444             swatch[offset >> 3] |= 1 << (offset & 7);
8445         }
8446
8447     join_end_of_list:
8448
8449         /* Quit if at the end of the list */
8450         if (i >= len) {
8451
8452             /* But first, have to deal with the highest possible code point on
8453              * the platform.  The previous code assumes that <end> is one
8454              * beyond where we want to populate, but that is impossible at the
8455              * platform's infinity, so have to handle it specially */
8456             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8457             {
8458                 const STRLEN offset = (STRLEN)(end - start);
8459                 swatch[offset >> 3] |= 1 << (offset & 7);
8460             }
8461             return;
8462         }
8463
8464         /* Advance to the next range, which will be for code points not in the
8465          * inversion list */
8466         current = array[i];
8467     }
8468
8469     return;
8470 }
8471
8472 void
8473 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8474                                          const bool complement_b, SV** output)
8475 {
8476     /* Take the union of two inversion lists and point <output> to it.  *output
8477      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8478      * the reference count to that list will be decremented if not already a
8479      * temporary (mortal); otherwise *output will be made correspondingly
8480      * mortal.  The first list, <a>, may be NULL, in which case a copy of the
8481      * second list is returned.  If <complement_b> is TRUE, the union is taken
8482      * of the complement (inversion) of <b> instead of b itself.
8483      *
8484      * The basis for this comes from "Unicode Demystified" Chapter 13 by
8485      * Richard Gillam, published by Addison-Wesley, and explained at some
8486      * length there.  The preface says to incorporate its examples into your
8487      * code at your own risk.
8488      *
8489      * The algorithm is like a merge sort.
8490      *
8491      * XXX A potential performance improvement is to keep track as we go along
8492      * if only one of the inputs contributes to the result, meaning the other
8493      * is a subset of that one.  In that case, we can skip the final copy and
8494      * return the larger of the input lists, but then outside code might need
8495      * to keep track of whether to free the input list or not */
8496
8497     const UV* array_a;    /* a's array */
8498     const UV* array_b;
8499     UV len_a;       /* length of a's array */
8500     UV len_b;
8501
8502     SV* u;                      /* the resulting union */
8503     UV* array_u;
8504     UV len_u;
8505
8506     UV i_a = 0;             /* current index into a's array */
8507     UV i_b = 0;
8508     UV i_u = 0;
8509
8510     /* running count, as explained in the algorithm source book; items are
8511      * stopped accumulating and are output when the count changes to/from 0.
8512      * The count is incremented when we start a range that's in the set, and
8513      * decremented when we start a range that's not in the set.  So its range
8514      * is 0 to 2.  Only when the count is zero is something not in the set.
8515      */
8516     UV count = 0;
8517
8518     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8519     assert(a != b);
8520
8521     /* If either one is empty, the union is the other one */
8522     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8523         bool make_temp = FALSE; /* Should we mortalize the result? */
8524
8525         if (*output == a) {
8526             if (a != NULL) {
8527                 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8528                     SvREFCNT_dec_NN(a);
8529                 }
8530             }
8531         }
8532         if (*output != b) {
8533             *output = invlist_clone(b);
8534             if (complement_b) {
8535                 _invlist_invert(*output);
8536             }
8537         } /* else *output already = b; */
8538
8539         if (make_temp) {
8540             sv_2mortal(*output);
8541         }
8542         return;
8543     }
8544     else if ((len_b = _invlist_len(b)) == 0) {
8545         bool make_temp = FALSE;
8546         if (*output == b) {
8547             if (! (make_temp = cBOOL(SvTEMP(b)))) {
8548                 SvREFCNT_dec_NN(b);
8549             }
8550         }
8551
8552         /* The complement of an empty list is a list that has everything in it,
8553          * so the union with <a> includes everything too */
8554         if (complement_b) {
8555             if (a == *output) {
8556                 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8557                     SvREFCNT_dec_NN(a);
8558                 }
8559             }
8560             *output = _new_invlist(1);
8561             _append_range_to_invlist(*output, 0, UV_MAX);
8562         }
8563         else if (*output != a) {
8564             *output = invlist_clone(a);
8565         }
8566         /* else *output already = a; */
8567
8568         if (make_temp) {
8569             sv_2mortal(*output);
8570         }
8571         return;
8572     }
8573
8574     /* Here both lists exist and are non-empty */
8575     array_a = invlist_array(a);
8576     array_b = invlist_array(b);
8577
8578     /* If are to take the union of 'a' with the complement of b, set it
8579      * up so are looking at b's complement. */
8580     if (complement_b) {
8581
8582         /* To complement, we invert: if the first element is 0, remove it.  To
8583          * do this, we just pretend the array starts one later */
8584         if (array_b[0] == 0) {
8585             array_b++;
8586             len_b--;
8587         }
8588         else {
8589
8590             /* But if the first element is not zero, we pretend the list starts
8591              * at the 0 that is always stored immediately before the array. */
8592             array_b--;
8593             len_b++;
8594         }
8595     }
8596
8597     /* Size the union for the worst case: that the sets are completely
8598      * disjoint */
8599     u = _new_invlist(len_a + len_b);
8600
8601     /* Will contain U+0000 if either component does */
8602     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8603                                       || (len_b > 0 && array_b[0] == 0));
8604
8605     /* Go through each list item by item, stopping when exhausted one of
8606      * them */
8607     while (i_a < len_a && i_b < len_b) {
8608         UV cp;      /* The element to potentially add to the union's array */
8609         bool cp_in_set;   /* is it in the the input list's set or not */
8610
8611         /* We need to take one or the other of the two inputs for the union.
8612          * Since we are merging two sorted lists, we take the smaller of the
8613          * next items.  In case of a tie, we take the one that is in its set
8614          * first.  If we took one not in the set first, it would decrement the
8615          * count, possibly to 0 which would cause it to be output as ending the
8616          * range, and the next time through we would take the same number, and
8617          * output it again as beginning the next range.  By doing it the
8618          * opposite way, there is no possibility that the count will be
8619          * momentarily decremented to 0, and thus the two adjoining ranges will
8620          * be seamlessly merged.  (In a tie and both are in the set or both not
8621          * in the set, it doesn't matter which we take first.) */
8622         if (array_a[i_a] < array_b[i_b]
8623             || (array_a[i_a] == array_b[i_b]
8624                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8625         {
8626             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8627             cp= array_a[i_a++];
8628         }
8629         else {
8630             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8631             cp = array_b[i_b++];
8632         }
8633
8634         /* Here, have chosen which of the two inputs to look at.  Only output
8635          * if the running count changes to/from 0, which marks the
8636          * beginning/end of a range in that's in the set */
8637         if (cp_in_set) {
8638             if (count == 0) {
8639                 array_u[i_u++] = cp;
8640             }
8641             count++;
8642         }
8643         else {
8644             count--;
8645             if (count == 0) {
8646                 array_u[i_u++] = cp;
8647             }
8648         }
8649     }
8650
8651     /* Here, we are finished going through at least one of the lists, which
8652      * means there is something remaining in at most one.  We check if the list
8653      * that hasn't been exhausted is positioned such that we are in the middle
8654      * of a range in its set or not.  (i_a and i_b point to the element beyond
8655      * the one we care about.) If in the set, we decrement 'count'; if 0, there
8656      * is potentially more to output.
8657      * There are four cases:
8658      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
8659      *     in the union is entirely from the non-exhausted set.
8660      *  2) Both were in their sets, count is 2.  Nothing further should
8661      *     be output, as everything that remains will be in the exhausted
8662      *     list's set, hence in the union; decrementing to 1 but not 0 insures
8663      *     that
8664      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
8665      *     Nothing further should be output because the union includes
8666      *     everything from the exhausted set.  Not decrementing ensures that.
8667      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8668      *     decrementing to 0 insures that we look at the remainder of the
8669      *     non-exhausted set */
8670     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8671         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8672     {
8673         count--;
8674     }
8675
8676     /* The final length is what we've output so far, plus what else is about to
8677      * be output.  (If 'count' is non-zero, then the input list we exhausted
8678      * has everything remaining up to the machine's limit in its set, and hence
8679      * in the union, so there will be no further output. */
8680     len_u = i_u;
8681     if (count == 0) {
8682         /* At most one of the subexpressions will be non-zero */
8683         len_u += (len_a - i_a) + (len_b - i_b);
8684     }
8685
8686     /* Set result to final length, which can change the pointer to array_u, so
8687      * re-find it */
8688     if (len_u != _invlist_len(u)) {
8689         invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8690         invlist_trim(u);
8691         array_u = invlist_array(u);
8692     }
8693
8694     /* When 'count' is 0, the list that was exhausted (if one was shorter than
8695      * the other) ended with everything above it not in its set.  That means
8696      * that the remaining part of the union is precisely the same as the
8697      * non-exhausted list, so can just copy it unchanged.  (If both list were
8698      * exhausted at the same time, then the operations below will be both 0.)
8699      */
8700     if (count == 0) {
8701         IV copy_count; /* At most one will have a non-zero copy count */
8702         if ((copy_count = len_a - i_a) > 0) {
8703             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8704         }
8705         else if ((copy_count = len_b - i_b) > 0) {
8706             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8707         }
8708     }
8709
8710     /*  We may be removing a reference to one of the inputs.  If so, the output
8711      *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8712      *  count decremented) */
8713     if (a == *output || b == *output) {
8714         assert(! invlist_is_iterating(*output));
8715         if ((SvTEMP(*output))) {
8716             sv_2mortal(u);
8717         }
8718         else {
8719             SvREFCNT_dec_NN(*output);
8720         }
8721     }
8722
8723     *output = u;
8724
8725     return;
8726 }
8727
8728 void
8729 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8730                                                const bool complement_b, SV** i)
8731 {
8732     /* Take the intersection of two inversion lists and point <i> to it.  *i
8733      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8734      * the reference count to that list will be decremented if not already a
8735      * temporary (mortal); otherwise *i will be made correspondingly mortal.
8736      * The first list, <a>, may be NULL, in which case an empty list is
8737      * returned.  If <complement_b> is TRUE, the result will be the
8738      * intersection of <a> and the complement (or inversion) of <b> instead of
8739      * <b> directly.
8740      *
8741      * The basis for this comes from "Unicode Demystified" Chapter 13 by
8742      * Richard Gillam, published by Addison-Wesley, and explained at some
8743      * length there.  The preface says to incorporate its examples into your
8744      * code at your own risk.  In fact, it had bugs
8745      *
8746      * The algorithm is like a merge sort, and is essentially the same as the
8747      * union above
8748      */
8749
8750     const UV* array_a;          /* a's array */
8751     const UV* array_b;
8752     UV len_a;   /* length of a's array */
8753     UV len_b;
8754
8755     SV* r;                   /* the resulting intersection */
8756     UV* array_r;
8757     UV len_r;
8758
8759     UV i_a = 0;             /* current index into a's array */
8760     UV i_b = 0;
8761     UV i_r = 0;
8762
8763     /* running count, as explained in the algorithm source book; items are
8764      * stopped accumulating and are output when the count changes to/from 2.
8765      * The count is incremented when we start a range that's in the set, and
8766      * decremented when we start a range that's not in the set.  So its range
8767      * is 0 to 2.  Only when the count is 2 is something in the intersection.
8768      */
8769     UV count = 0;
8770
8771     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8772     assert(a != b);
8773
8774     /* Special case if either one is empty */
8775     len_a = (a == NULL) ? 0 : _invlist_len(a);
8776     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8777         bool make_temp = FALSE;
8778
8779         if (len_a != 0 && complement_b) {
8780
8781             /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8782              * be empty.  Here, also we are using 'b's complement, which hence
8783              * must be every possible code point.  Thus the intersection is
8784              * simply 'a'. */
8785             if (*i != a) {
8786                 if (*i == b) {
8787                     if (! (make_temp = cBOOL(SvTEMP(b)))) {
8788                         SvREFCNT_dec_NN(b);
8789                     }
8790                 }
8791
8792                 *i = invlist_clone(a);
8793             }
8794             /* else *i is already 'a' */
8795
8796             if (make_temp) {
8797                 sv_2mortal(*i);
8798             }
8799             return;
8800         }
8801
8802         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
8803          * intersection must be empty */
8804         if (*i == a) {
8805             if (! (make_temp = cBOOL(SvTEMP(a)))) {
8806                 SvREFCNT_dec_NN(a);
8807             }
8808         }
8809         else if (*i == b) {
8810             if (! (make_temp = cBOOL(SvTEMP(b)))) {
8811                 SvREFCNT_dec_NN(b);
8812             }
8813         }
8814         *i = _new_invlist(0);
8815         if (make_temp) {
8816             sv_2mortal(*i);
8817         }
8818
8819         return;
8820     }
8821
8822     /* Here both lists exist and are non-empty */
8823     array_a = invlist_array(a);
8824     array_b = invlist_array(b);
8825
8826     /* If are to take the intersection of 'a' with the complement of b, set it
8827      * up so are looking at b's complement. */
8828     if (complement_b) {
8829
8830         /* To complement, we invert: if the first element is 0, remove it.  To
8831          * do this, we just pretend the array starts one later */
8832         if (array_b[0] == 0) {
8833             array_b++;
8834             len_b--;
8835         }
8836         else {
8837
8838             /* But if the first element is not zero, we pretend the list starts
8839              * at the 0 that is always stored immediately before the array. */
8840             array_b--;
8841             len_b++;
8842         }
8843     }
8844
8845     /* Size the intersection for the worst case: that the intersection ends up
8846      * fragmenting everything to be completely disjoint */
8847     r= _new_invlist(len_a + len_b);
8848
8849     /* Will contain U+0000 iff both components do */
8850     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8851                                      && len_b > 0 && array_b[0] == 0);
8852
8853     /* Go through each list item by item, stopping when exhausted one of
8854      * them */
8855     while (i_a < len_a && i_b < len_b) {
8856         UV cp;      /* The element to potentially add to the intersection's
8857                        array */
8858         bool cp_in_set; /* Is it in the input list's set or not */
8859
8860         /* We need to take one or the other of the two inputs for the
8861          * intersection.  Since we are merging two sorted lists, we take the
8862          * smaller of the next items.  In case of a tie, we take the one that
8863          * is not in its set first (a difference from the union algorithm).  If
8864          * we took one in the set first, it would increment the count, possibly
8865          * to 2 which would cause it to be output as starting a range in the
8866          * intersection, and the next time through we would take that same
8867          * number, and output it again as ending the set.  By doing it the
8868          * opposite of this, there is no possibility that the count will be
8869          * momentarily incremented to 2.  (In a tie and both are in the set or
8870          * both not in the set, it doesn't matter which we take first.) */
8871         if (array_a[i_a] < array_b[i_b]
8872             || (array_a[i_a] == array_b[i_b]
8873                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8874         {
8875             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8876             cp= array_a[i_a++];
8877         }
8878         else {
8879             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8880             cp= array_b[i_b++];
8881         }
8882
8883         /* Here, have chosen which of the two inputs to look at.  Only output
8884          * if the running count changes to/from 2, which marks the
8885          * beginning/end of a range that's in the intersection */
8886         if (cp_in_set) {
8887             count++;
8888             if (count == 2) {
8889                 array_r[i_r++] = cp;
8890             }
8891         }
8892         else {
8893             if (count == 2) {
8894                 array_r[i_r++] = cp;
8895             }
8896             count--;
8897         }
8898     }
8899
8900     /* Here, we are finished going through at least one of the lists, which
8901      * means there is something remaining in at most one.  We check if the list
8902      * that has been exhausted is positioned such that we are in the middle
8903      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
8904      * the ones we care about.)  There are four cases:
8905      *  1) Both weren't in their sets, count is 0, and remains 0.  There's
8906      *     nothing left in the intersection.
8907      *  2) Both were in their sets, count is 2 and perhaps is incremented to
8908      *     above 2.  What should be output is exactly that which is in the
8909      *     non-exhausted set, as everything it has is also in the intersection
8910      *     set, and everything it doesn't have can't be in the intersection
8911      *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8912      *     gets incremented to 2.  Like the previous case, the intersection is
8913      *     everything that remains in the non-exhausted set.
8914      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8915      *     remains 1.  And the intersection has nothing more. */
8916     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8917         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8918     {
8919         count++;
8920     }
8921
8922     /* The final length is what we've output so far plus what else is in the
8923      * intersection.  At most one of the subexpressions below will be non-zero
8924      * */
8925     len_r = i_r;
8926     if (count >= 2) {
8927         len_r += (len_a - i_a) + (len_b - i_b);
8928     }
8929
8930     /* Set result to final length, which can change the pointer to array_r, so
8931      * re-find it */
8932     if (len_r != _invlist_len(r)) {
8933         invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8934         invlist_trim(r);
8935         array_r = invlist_array(r);
8936     }
8937
8938     /* Finish outputting any remaining */
8939     if (count >= 2) { /* At most one will have a non-zero copy count */
8940         IV copy_count;
8941         if ((copy_count = len_a - i_a) > 0) {
8942             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8943         }
8944         else if ((copy_count = len_b - i_b) > 0) {
8945             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8946         }
8947     }
8948
8949     /*  We may be removing a reference to one of the inputs.  If so, the output
8950      *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8951      *  count decremented) */
8952     if (a == *i || b == *i) {
8953         assert(! invlist_is_iterating(*i));
8954         if (SvTEMP(*i)) {
8955             sv_2mortal(r);
8956         }
8957         else {
8958             SvREFCNT_dec_NN(*i);
8959         }
8960     }
8961
8962     *i = r;
8963
8964     return;
8965 }
8966
8967 SV*
8968 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8969 {
8970     /* Add the range from 'start' to 'end' inclusive to the inversion list's
8971      * set.  A pointer to the inversion list is returned.  This may actually be
8972      * a new list, in which case the passed in one has been destroyed.  The
8973      * passed in inversion list can be NULL, in which case a new one is created
8974      * with just the one range in it */
8975
8976     SV* range_invlist;
8977     UV len;
8978
8979     if (invlist == NULL) {
8980         invlist = _new_invlist(2);
8981         len = 0;
8982     }
8983     else {
8984         len = _invlist_len(invlist);
8985     }
8986
8987     /* If comes after the final entry actually in the list, can just append it
8988      * to the end, */
8989     if (len == 0
8990         || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8991             && start >= invlist_array(invlist)[len - 1]))
8992     {
8993         _append_range_to_invlist(invlist, start, end);
8994         return invlist;
8995     }
8996
8997     /* Here, can't just append things, create and return a new inversion list
8998      * which is the union of this range and the existing inversion list */
8999     range_invlist = _new_invlist(2);
9000     _append_range_to_invlist(range_invlist, start, end);
9001
9002     _invlist_union(invlist, range_invlist, &invlist);
9003
9004     /* The temporary can be freed */
9005     SvREFCNT_dec_NN(range_invlist);
9006
9007     return invlist;
9008 }
9009
9010 SV*
9011 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
9012                                  UV** other_elements_ptr)
9013 {
9014     /* Create and return an inversion list whose contents are to be populated
9015      * by the caller.  The caller gives the number of elements (in 'size') and
9016      * the very first element ('element0').  This function will set
9017      * '*other_elements_ptr' to an array of UVs, where the remaining elements
9018      * are to be placed.
9019      *
9020      * Obviously there is some trust involved that the caller will properly
9021      * fill in the other elements of the array.
9022      *
9023      * (The first element needs to be passed in, as the underlying code does
9024      * things differently depending on whether it is zero or non-zero) */
9025
9026     SV* invlist = _new_invlist(size);
9027     bool offset;
9028
9029     PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
9030
9031     _append_range_to_invlist(invlist, element0, element0);
9032     offset = *get_invlist_offset_addr(invlist);
9033
9034     invlist_set_len(invlist, size, offset);
9035     *other_elements_ptr = invlist_array(invlist) + 1;
9036     return invlist;
9037 }
9038
9039 #endif
9040
9041 PERL_STATIC_INLINE SV*
9042 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
9043     return _add_range_to_invlist(invlist, cp, cp);
9044 }
9045
9046 #ifndef PERL_IN_XSUB_RE
9047 void
9048 Perl__invlist_invert(pTHX_ SV* const invlist)
9049 {
9050     /* Complement the input inversion list.  This adds a 0 if the list didn't
9051      * have a zero; removes it otherwise.  As described above, the data
9052      * structure is set up so that this is very efficient */
9053
9054     PERL_ARGS_ASSERT__INVLIST_INVERT;
9055
9056     assert(! invlist_is_iterating(invlist));
9057
9058     /* The inverse of matching nothing is matching everything */
9059     if (_invlist_len(invlist) == 0) {
9060         _append_range_to_invlist(invlist, 0, UV_MAX);
9061         return;
9062     }
9063
9064     *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
9065 }
9066
9067 #endif
9068
9069 PERL_STATIC_INLINE SV*
9070 S_invlist_clone(pTHX_ SV* const invlist)
9071 {
9072
9073     /* Return a new inversion list that is a copy of the input one, which is
9074      * unchanged.  The new list will not be mortal even if the old one was. */
9075
9076     /* Need to allocate extra space to accommodate Perl's addition of a
9077      * trailing NUL to SvPV's, since it thinks they are always strings */
9078     SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
9079     STRLEN physical_length = SvCUR(invlist);
9080     bool offset = *(get_invlist_offset_addr(invlist));
9081
9082     PERL_ARGS_ASSERT_INVLIST_CLONE;
9083
9084     *(get_invlist_offset_addr(new_invlist)) = offset;
9085     invlist_set_len(new_invlist, _invlist_len(invlist), offset);
9086     Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
9087
9088     return new_invlist;
9089 }
9090
9091 PERL_STATIC_INLINE STRLEN*
9092 S_get_invlist_iter_addr(SV* invlist)
9093 {
9094     /* Return the address of the UV that contains the current iteration
9095      * position */
9096
9097     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
9098
9099     assert(SvTYPE(invlist) == SVt_INVLIST);
9100
9101     return &(((XINVLIST*) SvANY(invlist))->iterator);
9102 }
9103
9104 PERL_STATIC_INLINE void
9105 S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
9106 {
9107     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
9108
9109     *get_invlist_iter_addr(invlist) = 0;
9110 }
9111
9112 PERL_STATIC_INLINE void
9113 S_invlist_iterfinish(SV* invlist)
9114 {
9115     /* Terminate iterator for invlist.  This is to catch development errors.
9116      * Any iteration that is interrupted before completed should call this
9117      * function.  Functions that add code points anywhere else but to the end
9118      * of an inversion list assert that they are not in the middle of an
9119      * iteration.  If they were, the addition would make the iteration
9120      * problematical: if the iteration hadn't reached the place where things
9121      * were being added, it would be ok */
9122
9123     PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
9124
9125     *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
9126 }
9127
9128 STATIC bool
9129 S_invlist_iternext(SV* invlist, UV* start, UV* end)
9130 {
9131     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
9132      * This call sets in <*start> and <*end>, the next range in <invlist>.
9133      * Returns <TRUE> if successful and the next call will return the next
9134      * range; <FALSE> if was already at the end of the list.  If the latter,
9135      * <*start> and <*end> are unchanged, and the next call to this function
9136      * will start over at the beginning of the list */
9137
9138     STRLEN* pos = get_invlist_iter_addr(invlist);
9139     UV len = _invlist_len(invlist);
9140     UV *array;
9141
9142     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
9143
9144     if (*pos >= len) {
9145         *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
9146         return FALSE;
9147     }
9148
9149     array = invlist_array(invlist);
9150
9151     *start = array[(*pos)++];
9152
9153     if (*pos >= len) {
9154         *end = UV_MAX;
9155     }
9156     else {
9157         *end = array[(*pos)++] - 1;
9158     }
9159
9160     return TRUE;
9161 }
9162
9163 PERL_STATIC_INLINE bool
9164 S_invlist_is_iterating(SV* const invlist)
9165 {
9166     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
9167
9168     return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
9169 }
9170
9171 PERL_STATIC_INLINE UV
9172 S_invlist_highest(SV* const invlist)
9173 {
9174     /* Returns the highest code point that matches an inversion list.  This API
9175      * has an ambiguity, as it returns 0 under either the highest is actually
9176      * 0, or if the list is empty.  If this distinction matters to you, check
9177      * for emptiness before calling this function */
9178
9179     UV len = _invlist_len(invlist);
9180     UV *array;
9181
9182     PERL_ARGS_ASSERT_INVLIST_HIGHEST;
9183
9184     if (len == 0) {
9185         return 0;
9186     }
9187
9188     array = invlist_array(invlist);
9189
9190     /* The last element in the array in the inversion list always starts a
9191      * range that goes to infinity.  That range may be for code points that are
9192      * matched in the inversion list, or it may be for ones that aren't
9193      * matched.  In the latter case, the highest code point in the set is one
9194      * less than the beginning of this range; otherwise it is the final element
9195      * of this range: infinity */
9196     return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
9197            ? UV_MAX
9198            : array[len - 1] - 1;
9199 }
9200
9201 #ifndef PERL_IN_XSUB_RE
9202 SV *
9203 Perl__invlist_contents(pTHX_ SV* const invlist)
9204 {
9205     /* Get the contents of an inversion list into a string SV so that they can
9206      * be printed out.  It uses the format traditionally done for debug tracing
9207      */
9208
9209     UV start, end;
9210     SV* output = newSVpvs("\n");
9211
9212     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
9213
9214     assert(! invlist_is_iterating(invlist));
9215
9216     invlist_iterinit(invlist);
9217     while (invlist_iternext(invlist, &start, &end)) {
9218         if (end == UV_MAX) {
9219             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
9220         }
9221         else if (end != start) {
9222             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
9223                     start,       end);
9224         }
9225         else {
9226             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
9227         }
9228     }
9229
9230     return output;
9231 }
9232 #endif
9233
9234 #ifndef PERL_IN_XSUB_RE
9235 void
9236 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
9237                          const char * const indent, SV* const invlist)
9238 {
9239     /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
9240      * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
9241      * the string 'indent'.  The output looks like this:
9242          [0] 0x000A .. 0x000D
9243          [2] 0x0085
9244          [4] 0x2028 .. 0x2029
9245          [6] 0x3104 .. INFINITY
9246      * This means that the first range of code points matched by the list are
9247      * 0xA through 0xD; the second range contains only the single code point
9248      * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
9249      * are used to define each range (except if the final range extends to
9250      * infinity, only a single element is needed).  The array index of the
9251      * first element for the corresponding range is given in brackets. */
9252
9253     UV start, end;
9254     STRLEN count = 0;
9255
9256     PERL_ARGS_ASSERT__INVLIST_DUMP;
9257
9258     if (invlist_is_iterating(invlist)) {
9259         Perl_dump_indent(aTHX_ level, file,
9260              "%sCan't dump inversion list because is in middle of iterating\n",
9261              indent);
9262         return;
9263     }
9264
9265     invlist_iterinit(invlist);
9266     while (invlist_iternext(invlist, &start, &end)) {
9267         if (end == UV_MAX) {
9268             Perl_dump_indent(aTHX_ level, file,
9269                                        "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9270                                    indent, (UV)count, start);
9271         }
9272         else if (end != start) {
9273             Perl_dump_indent(aTHX_ level, file,
9274                                     "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9275                                 indent, (UV)count, start,         end);
9276         }
9277         else {
9278             Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9279                                             indent, (UV)count, start);
9280         }
9281         count += 2;
9282     }
9283 }
9284
9285 void
9286 Perl__load_PL_utf8_foldclosures (pTHX)
9287 {
9288     assert(! PL_utf8_foldclosures);
9289
9290     /* If the folds haven't been read in, call a fold function
9291      * to force that */
9292     if (! PL_utf8_tofold) {
9293         U8 dummy[UTF8_MAXBYTES_CASE+1];
9294
9295         /* This string is just a short named one above \xff */
9296         to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
9297         assert(PL_utf8_tofold); /* Verify that worked */
9298     }
9299     PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
9300 }
9301 #endif
9302
9303 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
9304 bool
9305 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9306 {
9307     /* Return a boolean as to if the two passed in inversion lists are
9308      * identical.  The final argument, if TRUE, says to take the complement of
9309      * the second inversion list before doing the comparison */
9310
9311     const UV* array_a = invlist_array(a);
9312     const UV* array_b = invlist_array(b);
9313     UV len_a = _invlist_len(a);
9314     UV len_b = _invlist_len(b);
9315
9316     UV i = 0;               /* current index into the arrays */
9317     bool retval = TRUE;     /* Assume are identical until proven otherwise */
9318
9319     PERL_ARGS_ASSERT__INVLISTEQ;
9320
9321     /* If are to compare 'a' with the complement of b, set it
9322      * up so are looking at b's complement. */
9323     if (complement_b) {
9324
9325         /* The complement of nothing is everything, so <a> would have to have
9326          * just one element, starting at zero (ending at infinity) */
9327         if (len_b == 0) {
9328             return (len_a == 1 && array_a[0] == 0);
9329         }
9330         else if (array_b[0] == 0) {
9331
9332             /* Otherwise, to complement, we invert.  Here, the first element is
9333              * 0, just remove it.  To do this, we just pretend the array starts
9334              * one later */
9335
9336             array_b++;
9337             len_b--;
9338         }
9339         else {
9340
9341             /* But if the first element is not zero, we pretend the list starts
9342              * at the 0 that is always stored immediately before the array. */
9343             array_b--;
9344             len_b++;
9345         }
9346     }
9347
9348     /* Make sure that the lengths are the same, as well as the final element
9349      * before looping through the remainder.  (Thus we test the length, final,
9350      * and first elements right off the bat) */
9351     if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9352         retval = FALSE;
9353     }
9354     else for (i = 0; i < len_a - 1; i++) {
9355         if (array_a[i] != array_b[i]) {
9356             retval = FALSE;
9357             break;
9358         }
9359     }
9360
9361     return retval;
9362 }
9363 #endif
9364
9365 #undef HEADER_LENGTH
9366 #undef TO_INTERNAL_SIZE
9367 #undef FROM_INTERNAL_SIZE
9368 #undef INVLIST_VERSION_ID
9369
9370 /* End of inversion list object */
9371
9372 STATIC void
9373 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
9374 {
9375     /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
9376      * constructs, and updates RExC_flags with them.  On input, RExC_parse
9377      * should point to the first flag; it is updated on output to point to the
9378      * final ')' or ':'.  There needs to be at least one flag, or this will
9379      * abort */
9380
9381     /* for (?g), (?gc), and (?o) warnings; warning
9382        about (?c) will warn about (?g) -- japhy    */
9383
9384 #define WASTED_O  0x01
9385 #define WASTED_G  0x02
9386 #define WASTED_C  0x04
9387 #define WASTED_GC (WASTED_G|WASTED_C)
9388     I32 wastedflags = 0x00;
9389     U32 posflags = 0, negflags = 0;
9390     U32 *flagsp = &posflags;
9391     char has_charset_modifier = '\0';
9392     regex_charset cs;
9393     bool has_use_defaults = FALSE;
9394     const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
9395     int x_mod_count = 0;
9396
9397     PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
9398
9399     /* '^' as an initial flag sets certain defaults */
9400     if (UCHARAT(RExC_parse) == '^') {
9401         RExC_parse++;
9402         has_use_defaults = TRUE;
9403         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9404         set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
9405                                         ? REGEX_UNICODE_CHARSET
9406                                         : REGEX_DEPENDS_CHARSET);
9407     }
9408
9409     cs = get_regex_charset(RExC_flags);
9410     if (cs == REGEX_DEPENDS_CHARSET
9411         && (RExC_utf8 || RExC_uni_semantics))
9412     {
9413         cs = REGEX_UNICODE_CHARSET;
9414     }
9415
9416     while (*RExC_parse) {
9417         /* && strchr("iogcmsx", *RExC_parse) */
9418         /* (?g), (?gc) and (?o) are useless here
9419            and must be globally applied -- japhy */
9420         switch (*RExC_parse) {
9421
9422             /* Code for the imsx flags */
9423             CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
9424
9425             case LOCALE_PAT_MOD:
9426                 if (has_charset_modifier) {
9427                     goto excess_modifier;
9428                 }
9429                 else if (flagsp == &negflags) {
9430                     goto neg_modifier;
9431                 }
9432                 cs = REGEX_LOCALE_CHARSET;
9433                 has_charset_modifier = LOCALE_PAT_MOD;
9434                 break;
9435             case UNICODE_PAT_MOD:
9436                 if (has_charset_modifier) {
9437                     goto excess_modifier;
9438                 }
9439                 else if (flagsp == &negflags) {
9440                     goto neg_modifier;
9441                 }
9442                 cs = REGEX_UNICODE_CHARSET;
9443                 has_charset_modifier = UNICODE_PAT_MOD;
9444                 break;
9445             case ASCII_RESTRICT_PAT_MOD:
9446                 if (flagsp == &negflags) {
9447                     goto neg_modifier;
9448                 }
9449                 if (has_charset_modifier) {
9450                     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9451                         goto excess_modifier;
9452                     }
9453                     /* Doubled modifier implies more restricted */
9454                     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9455                 }
9456                 else {
9457                     cs = REGEX_ASCII_RESTRICTED_CHARSET;
9458                 }
9459                 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9460                 break;
9461             case DEPENDS_PAT_MOD:
9462                 if (has_use_defaults) {
9463                     goto fail_modifiers;
9464                 }
9465                 else if (flagsp == &negflags) {
9466                     goto neg_modifier;
9467                 }
9468                 else if (has_charset_modifier) {
9469                     goto excess_modifier;
9470                 }
9471
9472                 /* The dual charset means unicode semantics if the
9473                  * pattern (or target, not known until runtime) are
9474                  * utf8, or something in the pattern indicates unicode
9475                  * semantics */
9476                 cs = (RExC_utf8 || RExC_uni_semantics)
9477                      ? REGEX_UNICODE_CHARSET
9478                      : REGEX_DEPENDS_CHARSET;
9479                 has_charset_modifier = DEPENDS_PAT_MOD;
9480                 break;
9481             excess_modifier:
9482                 RExC_parse++;
9483                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9484                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9485                 }
9486                 else if (has_charset_modifier == *(RExC_parse - 1)) {
9487                     vFAIL2("Regexp modifier \"%c\" may not appear twice",
9488                                         *(RExC_parse - 1));
9489                 }
9490                 else {
9491                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9492                 }
9493                 /*NOTREACHED*/
9494             neg_modifier:
9495                 RExC_parse++;
9496                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
9497                                     *(RExC_parse - 1));
9498                 /*NOTREACHED*/
9499             case ONCE_PAT_MOD: /* 'o' */
9500             case GLOBAL_PAT_MOD: /* 'g' */
9501                 if (PASS2 && ckWARN(WARN_REGEXP)) {
9502                     const I32 wflagbit = *RExC_parse == 'o'
9503                                          ? WASTED_O
9504                                          : WASTED_G;
9505                     if (! (wastedflags & wflagbit) ) {
9506                         wastedflags |= wflagbit;
9507                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9508                         vWARN5(
9509                             RExC_parse + 1,
9510                             "Useless (%s%c) - %suse /%c modifier",
9511                             flagsp == &negflags ? "?-" : "?",
9512                             *RExC_parse,
9513                             flagsp == &negflags ? "don't " : "",
9514                             *RExC_parse
9515                         );
9516                     }
9517                 }
9518                 break;
9519
9520             case CONTINUE_PAT_MOD: /* 'c' */
9521                 if (PASS2 && ckWARN(WARN_REGEXP)) {
9522                     if (! (wastedflags & WASTED_C) ) {
9523                         wastedflags |= WASTED_GC;
9524                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9525                         vWARN3(
9526                             RExC_parse + 1,
9527                             "Useless (%sc) - %suse /gc modifier",
9528                             flagsp == &negflags ? "?-" : "?",
9529                             flagsp == &negflags ? "don't " : ""
9530                         );
9531                     }
9532                 }
9533                 break;
9534             case KEEPCOPY_PAT_MOD: /* 'p' */
9535                 if (flagsp == &negflags) {
9536                     if (PASS2)
9537                         ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9538                 } else {
9539                     *flagsp |= RXf_PMf_KEEPCOPY;
9540                 }
9541                 break;
9542             case '-':
9543                 /* A flag is a default iff it is following a minus, so
9544                  * if there is a minus, it means will be trying to
9545                  * re-specify a default which is an error */
9546                 if (has_use_defaults || flagsp == &negflags) {
9547                     goto fail_modifiers;
9548                 }
9549                 flagsp = &negflags;
9550                 wastedflags = 0;  /* reset so (?g-c) warns twice */
9551                 break;
9552             case ':':
9553             case ')':
9554                 RExC_flags |= posflags;
9555                 RExC_flags &= ~negflags;
9556                 set_regex_charset(&RExC_flags, cs);
9557                 if (RExC_flags & RXf_PMf_FOLD) {
9558                     RExC_contains_i = 1;
9559                 }
9560                 if (PASS2) {
9561                     STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
9562                 }
9563                 return;
9564                 /*NOTREACHED*/
9565             default:
9566             fail_modifiers:
9567                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9568                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9569                 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9570                       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9571                 /*NOTREACHED*/
9572         }
9573
9574         ++RExC_parse;
9575     }
9576
9577     if (PASS2) {
9578         STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
9579     }
9580 }
9581
9582 /*
9583  - reg - regular expression, i.e. main body or parenthesized thing
9584  *
9585  * Caller must absorb opening parenthesis.
9586  *
9587  * Combining parenthesis handling with the base level of regular expression
9588  * is a trifle forced, but the need to tie the tails of the branches to what
9589  * follows makes it hard to avoid.
9590  */
9591 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9592 #ifdef DEBUGGING
9593 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9594 #else
9595 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9596 #endif
9597
9598 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9599    flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
9600    needs to be restarted.
9601    Otherwise would only return NULL if regbranch() returns NULL, which
9602    cannot happen.  */
9603 STATIC regnode *
9604 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9605     /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9606      * 2 is like 1, but indicates that nextchar() has been called to advance
9607      * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
9608      * this flag alerts us to the need to check for that */
9609 {
9610     regnode *ret;               /* Will be the head of the group. */
9611     regnode *br;
9612     regnode *lastbr;
9613     regnode *ender = NULL;
9614     I32 parno = 0;
9615     I32 flags;
9616     U32 oregflags = RExC_flags;
9617     bool have_branch = 0;
9618     bool is_open = 0;
9619     I32 freeze_paren = 0;
9620     I32 after_freeze = 0;
9621     I32 num; /* numeric backreferences */
9622
9623     char * parse_start = RExC_parse; /* MJD */
9624     char * const oregcomp_parse = RExC_parse;
9625
9626     GET_RE_DEBUG_FLAGS_DECL;
9627
9628     PERL_ARGS_ASSERT_REG;
9629     DEBUG_PARSE("reg ");
9630
9631     *flagp = 0;                         /* Tentatively. */
9632
9633
9634     /* Make an OPEN node, if parenthesized. */
9635     if (paren) {
9636
9637         /* Under /x, space and comments can be gobbled up between the '(' and
9638          * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
9639          * intervening space, as the sequence is a token, and a token should be
9640          * indivisible */
9641         bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9642
9643         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9644             char *start_verb = RExC_parse;
9645             STRLEN verb_len = 0;
9646             char *start_arg = NULL;
9647             unsigned char op = 0;
9648             int argok = 1;
9649             int internal_argval = 0; /* internal_argval is only useful if
9650                                         !argok */
9651
9652             if (has_intervening_patws) {
9653                 RExC_parse++;
9654                 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
9655             }
9656             while ( *RExC_parse && *RExC_parse != ')' ) {
9657                 if ( *RExC_parse == ':' ) {
9658                     start_arg = RExC_parse + 1;
9659                     break;
9660                 }
9661                 RExC_parse++;
9662             }
9663             ++start_verb;
9664             verb_len = RExC_parse - start_verb;
9665             if ( start_arg ) {
9666                 RExC_parse++;
9667                 while ( *RExC_parse && *RExC_parse != ')' )
9668                     RExC_parse++;
9669                 if ( *RExC_parse != ')' )
9670                     vFAIL("Unterminated verb pattern argument");
9671                 if ( RExC_parse == start_arg )
9672                     start_arg = NULL;
9673             } else {
9674                 if ( *RExC_parse != ')' )
9675                     vFAIL("Unterminated verb pattern");
9676             }
9677
9678             switch ( *start_verb ) {
9679             case 'A':  /* (*ACCEPT) */
9680                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9681                     op = ACCEPT;
9682                     internal_argval = RExC_nestroot;
9683                 }
9684                 break;
9685             case 'C':  /* (*COMMIT) */
9686                 if ( memEQs(start_verb,verb_len,"COMMIT") )
9687                     op = COMMIT;
9688                 break;
9689             case 'F':  /* (*FAIL) */
9690                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9691                     op = OPFAIL;
9692                     argok = 0;
9693                 }
9694                 break;
9695             case ':':  /* (*:NAME) */
9696             case 'M':  /* (*MARK:NAME) */
9697                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9698                     op = MARKPOINT;
9699                     argok = -1;
9700                 }
9701                 break;
9702             case 'P':  /* (*PRUNE) */
9703                 if ( memEQs(start_verb,verb_len,"PRUNE") )
9704                     op = PRUNE;
9705                 break;
9706             case 'S':   /* (*SKIP) */
9707                 if ( memEQs(start_verb,verb_len,"SKIP") )
9708                     op = SKIP;
9709                 break;
9710             case 'T':  /* (*THEN) */
9711                 /* [19:06] <TimToady> :: is then */
9712                 if ( memEQs(start_verb,verb_len,"THEN") ) {
9713                     op = CUTGROUP;
9714                     RExC_seen |= REG_CUTGROUP_SEEN;
9715                 }
9716                 break;
9717             }
9718             if ( ! op ) {
9719                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9720                 vFAIL2utf8f(
9721                     "Unknown verb pattern '%"UTF8f"'",
9722                     UTF8fARG(UTF, verb_len, start_verb));
9723             }
9724             if ( argok ) {
9725                 if ( start_arg && internal_argval ) {
9726                     vFAIL3("Verb pattern '%.*s' may not have an argument",
9727                         verb_len, start_verb);
9728                 } else if ( argok < 0 && !start_arg ) {
9729                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9730                         verb_len, start_verb);
9731                 } else {
9732                     ret = reganode(pRExC_state, op, internal_argval);
9733                     if ( ! internal_argval && ! SIZE_ONLY ) {
9734                         if (start_arg) {
9735                             SV *sv = newSVpvn( start_arg,
9736                                                RExC_parse - start_arg);
9737                             ARG(ret) = add_data( pRExC_state,
9738                                                  STR_WITH_LEN("S"));
9739                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9740                             ret->flags = 0;
9741                         } else {
9742                             ret->flags = 1;
9743                         }
9744                     }
9745                 }
9746                 if (!internal_argval)
9747                     RExC_seen |= REG_VERBARG_SEEN;
9748             } else if ( start_arg ) {
9749                 vFAIL3("Verb pattern '%.*s' may not have an argument",
9750                         verb_len, start_verb);
9751             } else {
9752                 ret = reg_node(pRExC_state, op);
9753             }
9754             nextchar(pRExC_state);
9755             return ret;
9756         }
9757         else if (*RExC_parse == '?') { /* (?...) */
9758             bool is_logical = 0;
9759             const char * const seqstart = RExC_parse;
9760             const char * endptr;
9761             if (has_intervening_patws) {
9762                 RExC_parse++;
9763                 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
9764             }
9765
9766             RExC_parse++;
9767             paren = *RExC_parse++;
9768             ret = NULL;                 /* For look-ahead/behind. */
9769             switch (paren) {
9770
9771             case 'P':   /* (?P...) variants for those used to PCRE/Python */
9772                 paren = *RExC_parse++;
9773                 if ( paren == '<')         /* (?P<...>) named capture */
9774                     goto named_capture;
9775                 else if (paren == '>') {   /* (?P>name) named recursion */
9776                     goto named_recursion;
9777                 }
9778                 else if (paren == '=') {   /* (?P=...)  named backref */
9779                     /* this pretty much dupes the code for \k<NAME> in
9780                      * regatom(), if you change this make sure you change that
9781                      * */
9782                     char* name_start = RExC_parse;
9783                     U32 num = 0;
9784                     SV *sv_dat = reg_scan_name(pRExC_state,
9785                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9786                     if (RExC_parse == name_start || *RExC_parse != ')')
9787                         /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9788                         vFAIL2("Sequence %.3s... not terminated",parse_start);
9789
9790                     if (!SIZE_ONLY) {
9791                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
9792                         RExC_rxi->data->data[num]=(void*)sv_dat;
9793                         SvREFCNT_inc_simple_void(sv_dat);
9794                     }
9795                     RExC_sawback = 1;
9796                     ret = reganode(pRExC_state,
9797                                    ((! FOLD)
9798                                      ? NREF
9799                                      : (ASCII_FOLD_RESTRICTED)
9800                                        ? NREFFA
9801                                        : (AT_LEAST_UNI_SEMANTICS)
9802                                          ? NREFFU
9803                                          : (LOC)
9804                                            ? NREFFL
9805                                            : NREFF),
9806                                     num);
9807                     *flagp |= HASWIDTH;
9808
9809                     Set_Node_Offset(ret, parse_start+1);
9810                     Set_Node_Cur_Length(ret, parse_start);
9811
9812                     nextchar(pRExC_state);
9813                     return ret;
9814                 }
9815                 RExC_parse++;
9816                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9817                 vFAIL3("Sequence (%.*s...) not recognized",
9818                                 RExC_parse-seqstart, seqstart);
9819                 /*NOTREACHED*/
9820             case '<':           /* (?<...) */
9821                 if (*RExC_parse == '!')
9822                     paren = ',';
9823                 else if (*RExC_parse != '=')
9824               named_capture:
9825                 {               /* (?<...>) */
9826                     char *name_start;
9827                     SV *svname;
9828                     paren= '>';
9829             case '\'':          /* (?'...') */
9830                     name_start= RExC_parse;
9831                     svname = reg_scan_name(pRExC_state,
9832                         SIZE_ONLY    /* reverse test from the others */
9833                         ? REG_RSN_RETURN_NAME
9834                         : REG_RSN_RETURN_NULL);
9835                     if (RExC_parse == name_start || *RExC_parse != paren)
9836                         vFAIL2("Sequence (?%c... not terminated",
9837                             paren=='>' ? '<' : paren);
9838                     if (SIZE_ONLY) {
9839                         HE *he_str;
9840                         SV *sv_dat = NULL;
9841                         if (!svname) /* shouldn't happen */
9842                             Perl_croak(aTHX_
9843                                 "panic: reg_scan_name returned NULL");
9844                         if (!RExC_paren_names) {
9845                             RExC_paren_names= newHV();
9846                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
9847 #ifdef DEBUGGING
9848                             RExC_paren_name_list= newAV();
9849                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
9850 #endif
9851                         }
9852                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
9853                         if ( he_str )
9854                             sv_dat = HeVAL(he_str);
9855                         if ( ! sv_dat ) {
9856                             /* croak baby croak */
9857                             Perl_croak(aTHX_
9858                                 "panic: paren_name hash element allocation failed");
9859                         } else if ( SvPOK(sv_dat) ) {
9860                             /* (?|...) can mean we have dupes so scan to check
9861                                its already been stored. Maybe a flag indicating
9862                                we are inside such a construct would be useful,
9863                                but the arrays are likely to be quite small, so
9864                                for now we punt -- dmq */
9865                             IV count = SvIV(sv_dat);
9866                             I32 *pv = (I32*)SvPVX(sv_dat);
9867                             IV i;
9868                             for ( i = 0 ; i < count ; i++ ) {
9869                                 if ( pv[i] == RExC_npar ) {
9870                                     count = 0;
9871                                     break;
9872                                 }
9873                             }
9874                             if ( count ) {
9875                                 pv = (I32*)SvGROW(sv_dat,
9876                                                 SvCUR(sv_dat) + sizeof(I32)+1);
9877                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
9878                                 pv[count] = RExC_npar;
9879                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
9880                             }
9881                         } else {
9882                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
9883                             sv_setpvn(sv_dat, (char *)&(RExC_npar),
9884                                                                 sizeof(I32));
9885                             SvIOK_on(sv_dat);
9886                             SvIV_set(sv_dat, 1);
9887                         }
9888 #ifdef DEBUGGING
9889                         /* Yes this does cause a memory leak in debugging Perls
9890                          * */
9891                         if (!av_store(RExC_paren_name_list,
9892                                       RExC_npar, SvREFCNT_inc(svname)))
9893                             SvREFCNT_dec_NN(svname);
9894 #endif
9895
9896                         /*sv_dump(sv_dat);*/
9897                     }
9898                     nextchar(pRExC_state);
9899                     paren = 1;
9900                     goto capturing_parens;
9901                 }
9902                 RExC_seen |= REG_LOOKBEHIND_SEEN;
9903                 RExC_in_lookbehind++;
9904                 RExC_parse++;
9905                 /* FALLTHROUGH */
9906             case '=':           /* (?=...) */
9907                 RExC_seen_zerolen++;
9908                 break;
9909             case '!':           /* (?!...) */
9910                 RExC_seen_zerolen++;
9911                 if (*RExC_parse == ')') {
9912                     ret=reg_node(pRExC_state, OPFAIL);
9913                     nextchar(pRExC_state);
9914                     return ret;
9915                 }
9916                 break;
9917             case '|':           /* (?|...) */
9918                 /* branch reset, behave like a (?:...) except that
9919                    buffers in alternations share the same numbers */
9920                 paren = ':';
9921                 after_freeze = freeze_paren = RExC_npar;
9922                 break;
9923             case ':':           /* (?:...) */
9924             case '>':           /* (?>...) */
9925                 break;
9926             case '$':           /* (?$...) */
9927             case '@':           /* (?@...) */
9928                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
9929                 break;
9930             case '0' :           /* (?0) */
9931             case 'R' :           /* (?R) */
9932                 if (*RExC_parse != ')')
9933                     FAIL("Sequence (?R) not terminated");
9934                 ret = reg_node(pRExC_state, GOSTART);
9935                     RExC_seen |= REG_GOSTART_SEEN;
9936                 *flagp |= POSTPONED;
9937                 nextchar(pRExC_state);
9938                 return ret;
9939                 /*notreached*/
9940             /* named and numeric backreferences */
9941             case '&':            /* (?&NAME) */
9942                 parse_start = RExC_parse - 1;
9943               named_recursion:
9944                 {
9945                     SV *sv_dat = reg_scan_name(pRExC_state,
9946                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9947                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9948                 }
9949                 if (RExC_parse == RExC_end || *RExC_parse != ')')
9950                     vFAIL("Sequence (?&... not terminated");
9951                 goto gen_recurse_regop;
9952                 assert(0); /* NOT REACHED */
9953             case '+':
9954                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9955                     RExC_parse++;
9956                     vFAIL("Illegal pattern");
9957                 }
9958                 goto parse_recursion;
9959                 /* NOT REACHED*/
9960             case '-': /* (?-1) */
9961                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9962                     RExC_parse--; /* rewind to let it be handled later */
9963                     goto parse_flags;
9964                 }
9965                 /* FALLTHROUGH */
9966             case '1': case '2': case '3': case '4': /* (?1) */
9967             case '5': case '6': case '7': case '8': case '9':
9968                 RExC_parse--;
9969               parse_recursion:
9970                 {
9971                     bool is_neg = FALSE;
9972                     parse_start = RExC_parse - 1; /* MJD */
9973                     if (*RExC_parse == '-') {
9974                         RExC_parse++;
9975                         is_neg = TRUE;
9976                     }
9977                     num = grok_atou(RExC_parse, &endptr);
9978                     if (endptr)
9979                         RExC_parse = (char*)endptr;
9980                     if (is_neg) {
9981                         /* Some limit for num? */
9982                         num = -num;
9983                     }
9984                 }
9985                 if (*RExC_parse!=')')
9986                     vFAIL("Expecting close bracket");
9987
9988               gen_recurse_regop:
9989                 if ( paren == '-' ) {
9990                     /*
9991                     Diagram of capture buffer numbering.
9992                     Top line is the normal capture buffer numbers
9993                     Bottom line is the negative indexing as from
9994                     the X (the (?-2))
9995
9996                     +   1 2    3 4 5 X          6 7
9997                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
9998                     -   5 4    3 2 1 X          x x
9999
10000                     */
10001                     num = RExC_npar + num;
10002                     if (num < 1)  {
10003                         RExC_parse++;
10004                         vFAIL("Reference to nonexistent group");
10005                     }
10006                 } else if ( paren == '+' ) {
10007                     num = RExC_npar + num - 1;
10008                 }
10009
10010                 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
10011                 if (!SIZE_ONLY) {
10012                     if (num > (I32)RExC_rx->nparens) {
10013                         RExC_parse++;
10014                         vFAIL("Reference to nonexistent group");
10015                     }
10016                     RExC_recurse_count++;
10017                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10018                         "Recurse #%"UVuf" to %"IVdf"\n",
10019                               (UV)ARG(ret), (IV)ARG2L(ret)));
10020                 }
10021                 RExC_seen |= REG_RECURSE_SEEN;
10022                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
10023                 Set_Node_Offset(ret, parse_start); /* MJD */
10024
10025                 *flagp |= POSTPONED;
10026                 nextchar(pRExC_state);
10027                 return ret;
10028
10029             assert(0); /* NOT REACHED */
10030
10031             case '?':           /* (??...) */
10032                 is_logical = 1;
10033                 if (*RExC_parse != '{') {
10034                     RExC_parse++;
10035                     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10036                     vFAIL2utf8f(
10037                         "Sequence (%"UTF8f"...) not recognized",
10038                         UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
10039                     /*NOTREACHED*/
10040                 }
10041                 *flagp |= POSTPONED;
10042                 paren = *RExC_parse++;
10043                 /* FALLTHROUGH */
10044             case '{':           /* (?{...}) */
10045             {
10046                 U32 n = 0;
10047                 struct reg_code_block *cb;
10048
10049                 RExC_seen_zerolen++;
10050
10051                 if (   !pRExC_state->num_code_blocks
10052                     || pRExC_state->code_index >= pRExC_state->num_code_blocks
10053                     || pRExC_state->code_blocks[pRExC_state->code_index].start
10054                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
10055                             - RExC_start)
10056                 ) {
10057                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
10058                         FAIL("panic: Sequence (?{...}): no code block found\n");
10059                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
10060                 }
10061                 /* this is a pre-compiled code block (?{...}) */
10062                 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
10063                 RExC_parse = RExC_start + cb->end;
10064                 if (!SIZE_ONLY) {
10065                     OP *o = cb->block;
10066                     if (cb->src_regex) {
10067                         n = add_data(pRExC_state, STR_WITH_LEN("rl"));
10068                         RExC_rxi->data->data[n] =
10069                             (void*)SvREFCNT_inc((SV*)cb->src_regex);
10070                         RExC_rxi->data->data[n+1] = (void*)o;
10071                     }
10072                     else {
10073                         n = add_data(pRExC_state,
10074                                (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
10075                         RExC_rxi->data->data[n] = (void*)o;
10076                     }
10077                 }
10078                 pRExC_state->code_index++;
10079                 nextchar(pRExC_state);
10080
10081                 if (is_logical) {
10082                     regnode *eval;
10083                     ret = reg_node(pRExC_state, LOGICAL);
10084
10085                     eval = reg2Lanode(pRExC_state, EVAL,
10086                                        n,
10087
10088                                        /* for later propagation into (??{})
10089                                         * return value */
10090                                        RExC_flags & RXf_PMf_COMPILETIME
10091                                       );
10092                     if (!SIZE_ONLY) {
10093                         ret->flags = 2;
10094                     }
10095                     REGTAIL(pRExC_state, ret, eval);
10096                     /* deal with the length of this later - MJD */
10097                     return ret;
10098                 }
10099                 ret = reg2Lanode(pRExC_state, EVAL, n, 0);
10100                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
10101                 Set_Node_Offset(ret, parse_start);
10102                 return ret;
10103             }
10104             case '(':           /* (?(?{...})...) and (?(?=...)...) */
10105             {
10106                 int is_define= 0;
10107                 const int DEFINE_len = sizeof("DEFINE") - 1;
10108                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
10109                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
10110                         || RExC_parse[1] == '<'
10111                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
10112                         I32 flag;
10113                         regnode *tail;
10114
10115                         ret = reg_node(pRExC_state, LOGICAL);
10116                         if (!SIZE_ONLY)
10117                             ret->flags = 1;
10118
10119                         tail = reg(pRExC_state, 1, &flag, depth+1);
10120                         if (flag & RESTART_UTF8) {
10121                             *flagp = RESTART_UTF8;
10122                             return NULL;
10123                         }
10124                         REGTAIL(pRExC_state, ret, tail);
10125                         goto insert_if;
10126                     }
10127                     /* Fall through to ‘Unknown switch condition’ at the
10128                        end of the if/else chain. */
10129                 }
10130                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
10131                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
10132                 {
10133                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
10134                     char *name_start= RExC_parse++;
10135                     U32 num = 0;
10136                     SV *sv_dat=reg_scan_name(pRExC_state,
10137                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10138                     if (RExC_parse == name_start || *RExC_parse != ch)
10139                         vFAIL2("Sequence (?(%c... not terminated",
10140                             (ch == '>' ? '<' : ch));
10141                     RExC_parse++;
10142                     if (!SIZE_ONLY) {
10143                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
10144                         RExC_rxi->data->data[num]=(void*)sv_dat;
10145                         SvREFCNT_inc_simple_void(sv_dat);
10146                     }
10147                     ret = reganode(pRExC_state,NGROUPP,num);
10148                     goto insert_if_check_paren;
10149                 }
10150                 else if (strnEQ(RExC_parse, "DEFINE",
10151                                        MIN(DEFINE_len, RExC_end - RExC_parse)))
10152                 {
10153                     ret = reganode(pRExC_state,DEFINEP,0);
10154                     RExC_parse += DEFINE_len;
10155                     is_define = 1;
10156                     goto insert_if_check_paren;
10157                 }
10158                 else if (RExC_parse[0] == 'R') {
10159                     RExC_parse++;
10160                     parno = 0;
10161                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10162                         parno = grok_atou(RExC_parse, &endptr);
10163                         if (endptr)
10164                             RExC_parse = (char*)endptr;
10165                     } else if (RExC_parse[0] == '&') {
10166                         SV *sv_dat;
10167                         RExC_parse++;
10168                         sv_dat = reg_scan_name(pRExC_state,
10169                             SIZE_ONLY
10170                             ? REG_RSN_RETURN_NULL
10171                             : REG_RSN_RETURN_DATA);
10172                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10173                     }
10174                     ret = reganode(pRExC_state,INSUBP,parno);
10175                     goto insert_if_check_paren;
10176                 }
10177                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10178                     /* (?(1)...) */
10179                     char c;
10180                     char *tmp;
10181                     parno = grok_atou(RExC_parse, &endptr);
10182                     if (endptr)
10183                         RExC_parse = (char*)endptr;
10184                     ret = reganode(pRExC_state, GROUPP, parno);
10185
10186                  insert_if_check_paren:
10187                     if (*(tmp = nextchar(pRExC_state)) != ')') {
10188                         /* nextchar also skips comments, so undo its work
10189                          * and skip over the the next character.
10190                          */
10191                         RExC_parse = tmp;
10192                         RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10193                         vFAIL("Switch condition not recognized");
10194                     }
10195                   insert_if:
10196                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
10197                     br = regbranch(pRExC_state, &flags, 1,depth+1);
10198                     if (br == NULL) {
10199                         if (flags & RESTART_UTF8) {
10200                             *flagp = RESTART_UTF8;
10201                             return NULL;
10202                         }
10203                         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10204                               (UV) flags);
10205                     } else
10206                         REGTAIL(pRExC_state, br, reganode(pRExC_state,
10207                                                           LONGJMP, 0));
10208                     c = *nextchar(pRExC_state);
10209                     if (flags&HASWIDTH)
10210                         *flagp |= HASWIDTH;
10211                     if (c == '|') {
10212                         if (is_define)
10213                             vFAIL("(?(DEFINE)....) does not allow branches");
10214
10215                         /* Fake one for optimizer.  */
10216                         lastbr = reganode(pRExC_state, IFTHEN, 0);
10217
10218                         if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
10219                             if (flags & RESTART_UTF8) {
10220                                 *flagp = RESTART_UTF8;
10221                                 return NULL;
10222                             }
10223                             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10224                                   (UV) flags);
10225                         }
10226                         REGTAIL(pRExC_state, ret, lastbr);
10227                         if (flags&HASWIDTH)
10228                             *flagp |= HASWIDTH;
10229                         c = *nextchar(pRExC_state);
10230                     }
10231                     else
10232                         lastbr = NULL;
10233                     if (c != ')') {
10234                         if (RExC_parse>RExC_end)
10235                             vFAIL("Switch (?(condition)... not terminated");
10236                         else
10237                             vFAIL("Switch (?(condition)... contains too many branches");
10238                     }
10239                     ender = reg_node(pRExC_state, TAIL);
10240                     REGTAIL(pRExC_state, br, ender);
10241                     if (lastbr) {
10242                         REGTAIL(pRExC_state, lastbr, ender);
10243                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10244                     }
10245                     else
10246                         REGTAIL(pRExC_state, ret, ender);
10247                     RExC_size++; /* XXX WHY do we need this?!!
10248                                     For large programs it seems to be required
10249                                     but I can't figure out why. -- dmq*/
10250                     return ret;
10251                 }
10252                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10253                 vFAIL("Unknown switch condition (?(...))");
10254             }
10255             case '[':           /* (?[ ... ]) */
10256                 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
10257                                          oregcomp_parse);
10258             case 0:
10259                 RExC_parse--; /* for vFAIL to print correctly */
10260                 vFAIL("Sequence (? incomplete");
10261                 break;
10262             default: /* e.g., (?i) */
10263                 --RExC_parse;
10264               parse_flags:
10265                 parse_lparen_question_flags(pRExC_state);
10266                 if (UCHARAT(RExC_parse) != ':') {
10267                     nextchar(pRExC_state);
10268                     *flagp = TRYAGAIN;
10269                     return NULL;
10270                 }
10271                 paren = ':';
10272                 nextchar(pRExC_state);
10273                 ret = NULL;
10274                 goto parse_rest;
10275             } /* end switch */
10276         }
10277         else {                  /* (...) */
10278           capturing_parens:
10279             parno = RExC_npar;
10280             RExC_npar++;
10281
10282             ret = reganode(pRExC_state, OPEN, parno);
10283             if (!SIZE_ONLY ){
10284                 if (!RExC_nestroot)
10285                     RExC_nestroot = parno;
10286                 if (RExC_seen & REG_RECURSE_SEEN
10287                     && !RExC_open_parens[parno-1])
10288                 {
10289                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10290                         "Setting open paren #%"IVdf" to %d\n",
10291                         (IV)parno, REG_NODE_NUM(ret)));
10292                     RExC_open_parens[parno-1]= ret;
10293                 }
10294             }
10295             Set_Node_Length(ret, 1); /* MJD */
10296             Set_Node_Offset(ret, RExC_parse); /* MJD */
10297             is_open = 1;
10298         }
10299     }
10300     else                        /* ! paren */
10301         ret = NULL;
10302
10303    parse_rest:
10304     /* Pick up the branches, linking them together. */
10305     parse_start = RExC_parse;   /* MJD */
10306     br = regbranch(pRExC_state, &flags, 1,depth+1);
10307
10308     /*     branch_len = (paren != 0); */
10309
10310     if (br == NULL) {
10311         if (flags & RESTART_UTF8) {
10312             *flagp = RESTART_UTF8;
10313             return NULL;
10314         }
10315         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10316     }
10317     if (*RExC_parse == '|') {
10318         if (!SIZE_ONLY && RExC_extralen) {
10319             reginsert(pRExC_state, BRANCHJ, br, depth+1);
10320         }
10321         else {                  /* MJD */
10322             reginsert(pRExC_state, BRANCH, br, depth+1);
10323             Set_Node_Length(br, paren != 0);
10324             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
10325         }
10326         have_branch = 1;
10327         if (SIZE_ONLY)
10328             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
10329     }
10330     else if (paren == ':') {
10331         *flagp |= flags&SIMPLE;
10332     }
10333     if (is_open) {                              /* Starts with OPEN. */
10334         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
10335     }
10336     else if (paren != '?')              /* Not Conditional */
10337         ret = br;
10338     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10339     lastbr = br;
10340     while (*RExC_parse == '|') {
10341         if (!SIZE_ONLY && RExC_extralen) {
10342             ender = reganode(pRExC_state, LONGJMP,0);
10343
10344             /* Append to the previous. */
10345             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10346         }
10347         if (SIZE_ONLY)
10348             RExC_extralen += 2;         /* Account for LONGJMP. */
10349         nextchar(pRExC_state);
10350         if (freeze_paren) {
10351             if (RExC_npar > after_freeze)
10352                 after_freeze = RExC_npar;
10353             RExC_npar = freeze_paren;
10354         }
10355         br = regbranch(pRExC_state, &flags, 0, depth+1);
10356
10357         if (br == NULL) {
10358             if (flags & RESTART_UTF8) {
10359                 *flagp = RESTART_UTF8;
10360                 return NULL;
10361             }
10362             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10363         }
10364         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
10365         lastbr = br;
10366         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10367     }
10368
10369     if (have_branch || paren != ':') {
10370         /* Make a closing node, and hook it on the end. */
10371         switch (paren) {
10372         case ':':
10373             ender = reg_node(pRExC_state, TAIL);
10374             break;
10375         case 1: case 2:
10376             ender = reganode(pRExC_state, CLOSE, parno);
10377             if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
10378                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10379                         "Setting close paren #%"IVdf" to %d\n",
10380                         (IV)parno, REG_NODE_NUM(ender)));
10381                 RExC_close_parens[parno-1]= ender;
10382                 if (RExC_nestroot == parno)
10383                     RExC_nestroot = 0;
10384             }
10385             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
10386             Set_Node_Length(ender,1); /* MJD */
10387             break;
10388         case '<':
10389         case ',':
10390         case '=':
10391         case '!':
10392             *flagp &= ~HASWIDTH;
10393             /* FALLTHROUGH */
10394         case '>':
10395             ender = reg_node(pRExC_state, SUCCEED);
10396             break;
10397         case 0:
10398             ender = reg_node(pRExC_state, END);
10399             if (!SIZE_ONLY) {
10400                 assert(!RExC_opend); /* there can only be one! */
10401                 RExC_opend = ender;
10402             }
10403             break;
10404         }
10405         DEBUG_PARSE_r(if (!SIZE_ONLY) {
10406             SV * const mysv_val1=sv_newmortal();
10407             SV * const mysv_val2=sv_newmortal();
10408             DEBUG_PARSE_MSG("lsbr");
10409             regprop(RExC_rx, mysv_val1, lastbr, NULL);
10410             regprop(RExC_rx, mysv_val2, ender, NULL);
10411             PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10412                           SvPV_nolen_const(mysv_val1),
10413                           (IV)REG_NODE_NUM(lastbr),
10414                           SvPV_nolen_const(mysv_val2),
10415                           (IV)REG_NODE_NUM(ender),
10416                           (IV)(ender - lastbr)
10417             );
10418         });
10419         REGTAIL(pRExC_state, lastbr, ender);
10420
10421         if (have_branch && !SIZE_ONLY) {
10422             char is_nothing= 1;
10423             if (depth==1)
10424                 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
10425
10426             /* Hook the tails of the branches to the closing node. */
10427             for (br = ret; br; br = regnext(br)) {
10428                 const U8 op = PL_regkind[OP(br)];
10429                 if (op == BRANCH) {
10430                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
10431                     if ( OP(NEXTOPER(br)) != NOTHING
10432                          || regnext(NEXTOPER(br)) != ender)
10433                         is_nothing= 0;
10434                 }
10435                 else if (op == BRANCHJ) {
10436                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
10437                     /* for now we always disable this optimisation * /
10438                     if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
10439                          || regnext(NEXTOPER(NEXTOPER(br))) != ender)
10440                     */
10441                         is_nothing= 0;
10442                 }
10443             }
10444             if (is_nothing) {
10445                 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
10446                 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10447                     SV * const mysv_val1=sv_newmortal();
10448                     SV * const mysv_val2=sv_newmortal();
10449                     DEBUG_PARSE_MSG("NADA");
10450                     regprop(RExC_rx, mysv_val1, ret, NULL);
10451                     regprop(RExC_rx, mysv_val2, ender, NULL);
10452                     PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10453                                   SvPV_nolen_const(mysv_val1),
10454                                   (IV)REG_NODE_NUM(ret),
10455                                   SvPV_nolen_const(mysv_val2),
10456                                   (IV)REG_NODE_NUM(ender),
10457                                   (IV)(ender - ret)
10458                     );
10459                 });
10460                 OP(br)= NOTHING;
10461                 if (OP(ender) == TAIL) {
10462                     NEXT_OFF(br)= 0;
10463                     RExC_emit= br + 1;
10464                 } else {
10465                     regnode *opt;
10466                     for ( opt= br + 1; opt < ender ; opt++ )
10467                         OP(opt)= OPTIMIZED;
10468                     NEXT_OFF(br)= ender - br;
10469                 }
10470             }
10471         }
10472     }
10473
10474     {
10475         const char *p;
10476         static const char parens[] = "=!<,>";
10477
10478         if (paren && (p = strchr(parens, paren))) {
10479             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
10480             int flag = (p - parens) > 1;
10481
10482             if (paren == '>')
10483                 node = SUSPEND, flag = 0;
10484             reginsert(pRExC_state, node,ret, depth+1);
10485             Set_Node_Cur_Length(ret, parse_start);
10486             Set_Node_Offset(ret, parse_start + 1);
10487             ret->flags = flag;
10488             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10489         }
10490     }
10491
10492     /* Check for proper termination. */
10493     if (paren) {
10494         /* restore original flags, but keep (?p) */
10495         RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10496         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
10497             RExC_parse = oregcomp_parse;
10498             vFAIL("Unmatched (");
10499         }
10500     }
10501     else if (!paren && RExC_parse < RExC_end) {
10502         if (*RExC_parse == ')') {
10503             RExC_parse++;
10504             vFAIL("Unmatched )");
10505         }
10506         else
10507             FAIL("Junk on end of regexp");      /* "Can't happen". */
10508         assert(0); /* NOTREACHED */
10509     }
10510
10511     if (RExC_in_lookbehind) {
10512         RExC_in_lookbehind--;
10513     }
10514     if (after_freeze > RExC_npar)
10515         RExC_npar = after_freeze;
10516     return(ret);
10517 }
10518
10519 /*
10520  - regbranch - one alternative of an | operator
10521  *
10522  * Implements the concatenation operator.
10523  *
10524  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10525  * restarted.
10526  */
10527 STATIC regnode *
10528 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10529 {
10530     regnode *ret;
10531     regnode *chain = NULL;
10532     regnode *latest;
10533     I32 flags = 0, c = 0;
10534     GET_RE_DEBUG_FLAGS_DECL;
10535
10536     PERL_ARGS_ASSERT_REGBRANCH;
10537
10538     DEBUG_PARSE("brnc");
10539
10540     if (first)
10541         ret = NULL;
10542     else {
10543         if (!SIZE_ONLY && RExC_extralen)
10544             ret = reganode(pRExC_state, BRANCHJ,0);
10545         else {
10546             ret = reg_node(pRExC_state, BRANCH);
10547             Set_Node_Length(ret, 1);
10548         }
10549     }
10550
10551     if (!first && SIZE_ONLY)
10552         RExC_extralen += 1;                     /* BRANCHJ */
10553
10554     *flagp = WORST;                     /* Tentatively. */
10555
10556     RExC_parse--;
10557     nextchar(pRExC_state);
10558     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10559         flags &= ~TRYAGAIN;
10560         latest = regpiece(pRExC_state, &flags,depth+1);
10561         if (latest == NULL) {
10562             if (flags & TRYAGAIN)
10563                 continue;
10564             if (flags & RESTART_UTF8) {
10565                 *flagp = RESTART_UTF8;
10566                 return NULL;
10567             }
10568             FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10569         }
10570         else if (ret == NULL)
10571             ret = latest;
10572         *flagp |= flags&(HASWIDTH|POSTPONED);
10573         if (chain == NULL)      /* First piece. */
10574             *flagp |= flags&SPSTART;
10575         else {
10576             RExC_naughty++;
10577             REGTAIL(pRExC_state, chain, latest);
10578         }
10579         chain = latest;
10580         c++;
10581     }
10582     if (chain == NULL) {        /* Loop ran zero times. */
10583         chain = reg_node(pRExC_state, NOTHING);
10584         if (ret == NULL)
10585             ret = chain;
10586     }
10587     if (c == 1) {
10588         *flagp |= flags&SIMPLE;
10589     }
10590
10591     return ret;
10592 }
10593
10594 /*
10595  - regpiece - something followed by possible [*+?]
10596  *
10597  * Note that the branching code sequences used for ? and the general cases
10598  * of * and + are somewhat optimized:  they use the same NOTHING node as
10599  * both the endmarker for their branch list and the body of the last branch.
10600  * It might seem that this node could be dispensed with entirely, but the
10601  * endmarker role is not redundant.
10602  *
10603  * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10604  * TRYAGAIN.
10605  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10606  * restarted.
10607  */
10608 STATIC regnode *
10609 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10610 {
10611     regnode *ret;
10612     char op;
10613     char *next;
10614     I32 flags;
10615     const char * const origparse = RExC_parse;
10616     I32 min;
10617     I32 max = REG_INFTY;
10618 #ifdef RE_TRACK_PATTERN_OFFSETS
10619     char *parse_start;
10620 #endif
10621     const char *maxpos = NULL;
10622
10623     /* Save the original in case we change the emitted regop to a FAIL. */
10624     regnode * const orig_emit = RExC_emit;
10625
10626     GET_RE_DEBUG_FLAGS_DECL;
10627
10628     PERL_ARGS_ASSERT_REGPIECE;
10629
10630     DEBUG_PARSE("piec");
10631
10632     ret = regatom(pRExC_state, &flags,depth+1);
10633     if (ret == NULL) {
10634         if (flags & (TRYAGAIN|RESTART_UTF8))
10635             *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10636         else
10637             FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10638         return(NULL);
10639     }
10640
10641     op = *RExC_parse;
10642
10643     if (op == '{' && regcurly(RExC_parse)) {
10644         maxpos = NULL;
10645 #ifdef RE_TRACK_PATTERN_OFFSETS
10646         parse_start = RExC_parse; /* MJD */
10647 #endif
10648         next = RExC_parse + 1;
10649         while (isDIGIT(*next) || *next == ',') {
10650             if (*next == ',') {
10651                 if (maxpos)
10652                     break;
10653                 else
10654                     maxpos = next;
10655             }
10656             next++;
10657         }
10658         if (*next == '}') {             /* got one */
10659             const char* endptr;
10660             if (!maxpos)
10661                 maxpos = next;
10662             RExC_parse++;
10663             min = grok_atou(RExC_parse, &endptr);
10664             if (*maxpos == ',')
10665                 maxpos++;
10666             else
10667                 maxpos = RExC_parse;
10668             max = grok_atou(maxpos, &endptr);
10669             if (!max && *maxpos != '0')
10670                 max = REG_INFTY;                /* meaning "infinity" */
10671             else if (max >= REG_INFTY)
10672                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10673             RExC_parse = next;
10674             nextchar(pRExC_state);
10675             if (max < min) {    /* If can't match, warn and optimize to fail
10676                                    unconditionally */
10677                 if (SIZE_ONLY) {
10678
10679                     /* We can't back off the size because we have to reserve
10680                      * enough space for all the things we are about to throw
10681                      * away, but we can shrink it by the ammount we are about
10682                      * to re-use here */
10683                     RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10684                 }
10685                 else {
10686                     ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10687                     RExC_emit = orig_emit;
10688                 }
10689                 ret = reg_node(pRExC_state, OPFAIL);
10690                 return ret;
10691             }
10692             else if (min == max
10693                      && RExC_parse < RExC_end
10694                      && (*RExC_parse == '?' || *RExC_parse == '+'))
10695             {
10696                 if (PASS2) {
10697                     ckWARN2reg(RExC_parse + 1,
10698                                "Useless use of greediness modifier '%c'",
10699                                *RExC_parse);
10700                 }
10701                 /* Absorb the modifier, so later code doesn't see nor use
10702                     * it */
10703                 nextchar(pRExC_state);
10704             }
10705
10706         do_curly:
10707             if ((flags&SIMPLE)) {
10708                 RExC_naughty += 2 + RExC_naughty / 2;
10709                 reginsert(pRExC_state, CURLY, ret, depth+1);
10710                 Set_Node_Offset(ret, parse_start+1); /* MJD */
10711                 Set_Node_Cur_Length(ret, parse_start);
10712             }
10713             else {
10714                 regnode * const w = reg_node(pRExC_state, WHILEM);
10715
10716                 w->flags = 0;
10717                 REGTAIL(pRExC_state, ret, w);
10718                 if (!SIZE_ONLY && RExC_extralen) {
10719                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
10720                     reginsert(pRExC_state, NOTHING,ret, depth+1);
10721                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
10722                 }
10723                 reginsert(pRExC_state, CURLYX,ret, depth+1);
10724                                 /* MJD hk */
10725                 Set_Node_Offset(ret, parse_start+1);
10726                 Set_Node_Length(ret,
10727                                 op == '{' ? (RExC_parse - parse_start) : 1);
10728
10729                 if (!SIZE_ONLY && RExC_extralen)
10730                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
10731                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10732                 if (SIZE_ONLY)
10733                     RExC_whilem_seen++, RExC_extralen += 3;
10734                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
10735             }
10736             ret->flags = 0;
10737
10738             if (min > 0)
10739                 *flagp = WORST;
10740             if (max > 0)
10741                 *flagp |= HASWIDTH;
10742             if (!SIZE_ONLY) {
10743                 ARG1_SET(ret, (U16)min);
10744                 ARG2_SET(ret, (U16)max);
10745             }
10746             if (max == REG_INFTY)
10747                 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10748
10749             goto nest_check;
10750         }
10751     }
10752
10753     if (!ISMULT1(op)) {
10754         *flagp = flags;
10755         return(ret);
10756     }
10757
10758 #if 0                           /* Now runtime fix should be reliable. */
10759
10760     /* if this is reinstated, don't forget to put this back into perldiag:
10761
10762             =item Regexp *+ operand could be empty at {#} in regex m/%s/
10763
10764            (F) The part of the regexp subject to either the * or + quantifier
10765            could match an empty string. The {#} shows in the regular
10766            expression about where the problem was discovered.
10767
10768     */
10769
10770     if (!(flags&HASWIDTH) && op != '?')
10771       vFAIL("Regexp *+ operand could be empty");
10772 #endif
10773
10774 #ifdef RE_TRACK_PATTERN_OFFSETS
10775     parse_start = RExC_parse;
10776 #endif
10777     nextchar(pRExC_state);
10778
10779     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10780
10781     if (op == '*' && (flags&SIMPLE)) {
10782         reginsert(pRExC_state, STAR, ret, depth+1);
10783         ret->flags = 0;
10784         RExC_naughty += 4;
10785         RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10786     }
10787     else if (op == '*') {
10788         min = 0;
10789         goto do_curly;
10790     }
10791     else if (op == '+' && (flags&SIMPLE)) {
10792         reginsert(pRExC_state, PLUS, ret, depth+1);
10793         ret->flags = 0;
10794         RExC_naughty += 3;
10795         RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10796     }
10797     else if (op == '+') {
10798         min = 1;
10799         goto do_curly;
10800     }
10801     else if (op == '?') {
10802         min = 0; max = 1;
10803         goto do_curly;
10804     }
10805   nest_check:
10806     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10807         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10808         ckWARN2reg(RExC_parse,
10809                    "%"UTF8f" matches null string many times",
10810                    UTF8fARG(UTF, (RExC_parse >= origparse
10811                                  ? RExC_parse - origparse
10812                                  : 0),
10813                    origparse));
10814         (void)ReREFCNT_inc(RExC_rx_sv);
10815     }
10816
10817     if (RExC_parse < RExC_end && *RExC_parse == '?') {
10818         nextchar(pRExC_state);
10819         reginsert(pRExC_state, MINMOD, ret, depth+1);
10820         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
10821     }
10822     else
10823     if (RExC_parse < RExC_end && *RExC_parse == '+') {
10824         regnode *ender;
10825         nextchar(pRExC_state);
10826         ender = reg_node(pRExC_state, SUCCEED);
10827         REGTAIL(pRExC_state, ret, ender);
10828         reginsert(pRExC_state, SUSPEND, ret, depth+1);
10829         ret->flags = 0;
10830         ender = reg_node(pRExC_state, TAIL);
10831         REGTAIL(pRExC_state, ret, ender);
10832     }
10833
10834     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
10835         RExC_parse++;
10836         vFAIL("Nested quantifiers");
10837     }
10838
10839     return(ret);
10840 }
10841
10842 STATIC STRLEN
10843 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
10844                       UV *valuep, I32 *flagp, U32 depth, SV** substitute_parse
10845     )
10846 {
10847
10848  /* This is expected to be called by a parser routine that has recognized '\N'
10849    and needs to handle the rest. RExC_parse is expected to point at the first
10850    char following the N at the time of the call.  On successful return,
10851    RExC_parse has been updated to point to just after the sequence identified
10852    by this routine, <*flagp> has been updated, and the non-NULL input pointers
10853    have been set appropriately.
10854
10855    The typical case for this is \N{some character name}.  This is usually
10856    called while parsing the input, filling in or ready to fill in an EXACTish
10857    node, and the code point for the character should be returned, so that it
10858    can be added to the node, and parsing continued with the next input
10859    character.  But it may be that instead of a single character the \N{}
10860    expands to more than one, a named sequence.  In this case any following
10861    quantifier applies to the whole sequence, and it is easier, given the code
10862    structure that calls this, to handle it from a different area of the code.
10863    For this reason, the input parameters can be set so that it returns valid
10864    only on one or the other of these cases.
10865
10866    Another possibility is for the input to be an empty \N{}, which for
10867    backwards compatibility we accept, but generate a NOTHING node which should
10868    later get optimized out.  This is handled from the area of code which can
10869    handle a named sequence, so if called with the parameters for the other, it
10870    fails.
10871
10872    Still another possibility is for the \N to mean [^\n], and not a single
10873    character or explicit sequence at all.  This is determined by context.
10874    Again, this is handled from the area of code which can handle a named
10875    sequence, so if called with the parameters for the other, it also fails.
10876
10877    And the final possibility is for the \N to be called from within a bracketed
10878    character class.  In this case the [^\n] meaning makes no sense, and so is
10879    an error.  Other anomalous situations are left to the calling code to handle.
10880
10881    For non-single-quoted regexes, the tokenizer has attempted to decide which
10882    of the above applies, and in the case of a named sequence, has converted it
10883    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
10884    where c1... are the characters in the sequence.  For single-quoted regexes,
10885    the tokenizer passes the \N sequence through unchanged; this code will not
10886    attempt to determine this nor expand those, instead raising a syntax error.
10887    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
10888    or there is no '}', it signals that this \N occurrence means to match a
10889    non-newline. (This mostly was done because of [perl #56444].)
10890
10891    The API is somewhat convoluted due to historical and the above reasons.
10892
10893    The function raises an error (via vFAIL), and doesn't return for various
10894    syntax errors.  For other failures, it returns (STRLEN) -1.  For successes,
10895    it returns a count of how many characters were accounted for by it.  (This
10896    can be 0 for \N{}; 1 for it meaning [^\n]; and otherwise the number of code
10897    points in the sequence.  It sets <node_p>, <valuep>, and/or
10898    <substitute_parse> on success.
10899
10900    If <valuep> is non-null, it means the caller can accept an input sequence
10901    consisting of a just a single code point; <*valuep> is set to the value
10902    of the only or first code point in the input.
10903
10904    If <substitute_parse> is non-null, it means the caller can accept an input
10905    sequence consisting of one or more code points; <*substitute_parse> is a
10906    newly created mortal SV* in this case, containing \x{} escapes representing
10907    those code points.
10908
10909    Both <valuep> and <substitute_parse> can be non-NULL.
10910
10911    If <node_p> is non-null, <substitute_parse> must be NULL.  This signifies
10912    that the caller can accept any legal sequence other than a single code
10913    point.  To wit, <*node_p> is set as follows:
10914     1) \N means not-a-NL: points to a newly created REG_ANY node; return is 1
10915     2) \N{}:              points to a new NOTHING node; return is 0
10916     3) otherwise:         points to a new EXACT node containing the resolved
10917                           string; return is the number of code points in the
10918                           string.  This will never be 1.
10919    Note that failure is returned for single code point sequences if <valuep> is
10920    null and <node_p> is not.
10921  */
10922
10923     char * endbrace;    /* '}' following the name */
10924     char* p;
10925     char *endchar;      /* Points to '.' or '}' ending cur char in the input
10926                            stream */
10927     bool has_multiple_chars; /* true if the input stream contains a sequence of
10928                                 more than one character */
10929     bool in_char_class = substitute_parse != NULL;
10930     STRLEN count = 0;   /* Number of characters in this sequence */
10931
10932     GET_RE_DEBUG_FLAGS_DECL;
10933
10934     PERL_ARGS_ASSERT_GROK_BSLASH_N;
10935
10936     GET_RE_DEBUG_FLAGS;
10937
10938     assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
10939     assert(! (node_p && substitute_parse)); /* At most 1 should be set */
10940
10941     /* The [^\n] meaning of \N ignores spaces and comments under the /x
10942      * modifier.  The other meaning does not, so use a temporary until we find
10943      * out which we are being called with */
10944     p = (RExC_flags & RXf_PMf_EXTENDED)
10945         ? regpatws(pRExC_state, RExC_parse,
10946                                 TRUE) /* means recognize comments */
10947         : RExC_parse;
10948
10949     /* Disambiguate between \N meaning a named character versus \N meaning
10950      * [^\n].  The former is assumed when it can't be the latter. */
10951     if (*p != '{' || regcurly(p)) {
10952         RExC_parse = p;
10953         if (! node_p) {
10954             /* no bare \N allowed in a charclass */
10955             if (in_char_class) {
10956                 vFAIL("\\N in a character class must be a named character: \\N{...}");
10957             }
10958             return (STRLEN) -1;
10959         }
10960         RExC_parse--;   /* Need to back off so nextchar() doesn't skip the
10961                            current char */
10962         nextchar(pRExC_state);
10963         *node_p = reg_node(pRExC_state, REG_ANY);
10964         *flagp |= HASWIDTH|SIMPLE;
10965         RExC_naughty++;
10966         Set_Node_Length(*node_p, 1); /* MJD */
10967         return 1;
10968     }
10969
10970     /* Here, we have decided it should be a named character or sequence */
10971
10972     /* The test above made sure that the next real character is a '{', but
10973      * under the /x modifier, it could be separated by space (or a comment and
10974      * \n) and this is not allowed (for consistency with \x{...} and the
10975      * tokenizer handling of \N{NAME}). */
10976     if (*RExC_parse != '{') {
10977         vFAIL("Missing braces on \\N{}");
10978     }
10979
10980     RExC_parse++;       /* Skip past the '{' */
10981
10982     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
10983         || ! (endbrace == RExC_parse            /* nothing between the {} */
10984               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below
10985                                                  */
10986                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg)
10987                                                      */
10988     {
10989         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
10990         vFAIL("\\N{NAME} must be resolved by the lexer");
10991     }
10992
10993     if (endbrace == RExC_parse) {   /* empty: \N{} */
10994         if (node_p) {
10995             *node_p = reg_node(pRExC_state,NOTHING);
10996         }
10997         else if (! in_char_class) {
10998             return (STRLEN) -1;
10999         }
11000         nextchar(pRExC_state);
11001         return 0;
11002     }
11003
11004     RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
11005     RExC_parse += 2;    /* Skip past the 'U+' */
11006
11007     endchar = RExC_parse + strcspn(RExC_parse, ".}");
11008
11009     /* Code points are separated by dots.  If none, there is only one code
11010      * point, and is terminated by the brace */
11011     has_multiple_chars = (endchar < endbrace);
11012
11013     /* We get the first code point if we want it, and either there is only one,
11014      * or we can accept both cases of one and more than one */
11015     if (valuep && (substitute_parse || ! has_multiple_chars)) {
11016         STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
11017         I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
11018                            | PERL_SCAN_DISALLOW_PREFIX
11019
11020                              /* No errors in the first pass (See [perl
11021                               * #122671].)  We let the code below find the
11022                               * errors when there are multiple chars. */
11023                            | ((SIZE_ONLY || has_multiple_chars)
11024                               ? PERL_SCAN_SILENT_ILLDIGIT
11025                               : 0);
11026
11027         *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
11028
11029         /* The tokenizer should have guaranteed validity, but it's possible to
11030          * bypass it by using single quoting, so check.  Don't do the check
11031          * here when there are multiple chars; we do it below anyway. */
11032         if (! has_multiple_chars) {
11033             if (length_of_hex == 0
11034                 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
11035             {
11036                 RExC_parse += length_of_hex;    /* Includes all the valid */
11037                 RExC_parse += (RExC_orig_utf8)  /* point to after 1st invalid */
11038                                 ? UTF8SKIP(RExC_parse)
11039                                 : 1;
11040                 /* Guard against malformed utf8 */
11041                 if (RExC_parse >= endchar) {
11042                     RExC_parse = endchar;
11043                 }
11044                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
11045             }
11046
11047             RExC_parse = endbrace + 1;
11048             return 1;
11049         }
11050     }
11051
11052     /* Here, we should have already handled the case where a single character
11053      * is expected and found.  So it is a failure if we aren't expecting
11054      * multiple chars and got them; or didn't get them but wanted them.  We
11055      * fail without advancing the parse, so that the caller can try again with
11056      * different acceptance criteria */
11057     if ((! node_p && ! substitute_parse) || ! has_multiple_chars) {
11058         RExC_parse = p;
11059         return (STRLEN) -1;
11060     }
11061
11062     {
11063
11064         /* What is done here is to convert this to a sub-pattern of the form
11065          * \x{char1}\x{char2}...
11066          * and then either return it in <*substitute_parse> if non-null; or
11067          * call reg recursively to parse it (enclosing in "(?: ... )" ).  That
11068          * way, it retains its atomicness, while not having to worry about
11069          * special handling that some code points may have.  toke.c has
11070          * converted the original Unicode values to native, so that we can just
11071          * pass on the hex values unchanged.  We do have to set a flag to keep
11072          * recoding from happening in the recursion */
11073
11074         SV * dummy = NULL;
11075         STRLEN len;
11076         char *orig_end = RExC_end;
11077         I32 flags;
11078
11079         if (substitute_parse) {
11080             *substitute_parse = newSVpvs("");
11081         }
11082         else {
11083             substitute_parse = &dummy;
11084             *substitute_parse = newSVpvs("?:");
11085         }
11086         *substitute_parse = sv_2mortal(*substitute_parse);
11087
11088         while (RExC_parse < endbrace) {
11089
11090             /* Convert to notation the rest of the code understands */
11091             sv_catpv(*substitute_parse, "\\x{");
11092             sv_catpvn(*substitute_parse, RExC_parse, endchar - RExC_parse);
11093             sv_catpv(*substitute_parse, "}");
11094
11095             /* Point to the beginning of the next character in the sequence. */
11096             RExC_parse = endchar + 1;
11097             endchar = RExC_parse + strcspn(RExC_parse, ".}");
11098
11099             count++;
11100         }
11101         if (! in_char_class) {
11102             sv_catpv(*substitute_parse, ")");
11103         }
11104
11105         RExC_parse = SvPV(*substitute_parse, len);
11106
11107         /* Don't allow empty number */
11108         if (len < (STRLEN) ((substitute_parse) ? 6 : 8)) {
11109             RExC_parse = endbrace;
11110             vFAIL("Invalid hexadecimal number in \\N{U+...}");
11111         }
11112         RExC_end = RExC_parse + len;
11113
11114         /* The values are Unicode, and therefore not subject to recoding */
11115         RExC_override_recoding = 1;
11116
11117         if (node_p) {
11118             if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
11119                 if (flags & RESTART_UTF8) {
11120                     *flagp = RESTART_UTF8;
11121                     return (STRLEN) -1;
11122                 }
11123                 FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
11124                     (UV) flags);
11125             }
11126             *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11127         }
11128
11129         RExC_parse = endbrace;
11130         RExC_end = orig_end;
11131         RExC_override_recoding = 0;
11132
11133         nextchar(pRExC_state);
11134     }
11135
11136     return count;
11137 }
11138
11139
11140 /*
11141  * reg_recode
11142  *
11143  * It returns the code point in utf8 for the value in *encp.
11144  *    value: a code value in the source encoding
11145  *    encp:  a pointer to an Encode object
11146  *
11147  * If the result from Encode is not a single character,
11148  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
11149  */
11150 STATIC UV
11151 S_reg_recode(pTHX_ const char value, SV **encp)
11152 {
11153     STRLEN numlen = 1;
11154     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
11155     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
11156     const STRLEN newlen = SvCUR(sv);
11157     UV uv = UNICODE_REPLACEMENT;
11158
11159     PERL_ARGS_ASSERT_REG_RECODE;
11160
11161     if (newlen)
11162         uv = SvUTF8(sv)
11163              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
11164              : *(U8*)s;
11165
11166     if (!newlen || numlen != newlen) {
11167         uv = UNICODE_REPLACEMENT;
11168         *encp = NULL;
11169     }
11170     return uv;
11171 }
11172
11173 PERL_STATIC_INLINE U8
11174 S_compute_EXACTish(RExC_state_t *pRExC_state)
11175 {
11176     U8 op;
11177
11178     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
11179
11180     if (! FOLD) {
11181         return EXACT;
11182     }
11183
11184     op = get_regex_charset(RExC_flags);
11185     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
11186         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
11187                  been, so there is no hole */
11188     }
11189
11190     return op + EXACTF;
11191 }
11192
11193 PERL_STATIC_INLINE void
11194 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
11195                          regnode *node, I32* flagp, STRLEN len, UV code_point,
11196                          bool downgradable)
11197 {
11198     /* This knows the details about sizing an EXACTish node, setting flags for
11199      * it (by setting <*flagp>, and potentially populating it with a single
11200      * character.
11201      *
11202      * If <len> (the length in bytes) is non-zero, this function assumes that
11203      * the node has already been populated, and just does the sizing.  In this
11204      * case <code_point> should be the final code point that has already been
11205      * placed into the node.  This value will be ignored except that under some
11206      * circumstances <*flagp> is set based on it.
11207      *
11208      * If <len> is zero, the function assumes that the node is to contain only
11209      * the single character given by <code_point> and calculates what <len>
11210      * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
11211      * additionally will populate the node's STRING with <code_point> or its
11212      * fold if folding.
11213      *
11214      * In both cases <*flagp> is appropriately set
11215      *
11216      * It knows that under FOLD, the Latin Sharp S and UTF characters above
11217      * 255, must be folded (the former only when the rules indicate it can
11218      * match 'ss')
11219      *
11220      * When it does the populating, it looks at the flag 'downgradable'.  If
11221      * true with a node that folds, it checks if the single code point
11222      * participates in a fold, and if not downgrades the node to an EXACT.
11223      * This helps the optimizer */
11224
11225     bool len_passed_in = cBOOL(len != 0);
11226     U8 character[UTF8_MAXBYTES_CASE+1];
11227
11228     PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
11229
11230     /* Don't bother to check for downgrading in PASS1, as it doesn't make any
11231      * sizing difference, and is extra work that is thrown away */
11232     if (downgradable && ! PASS2) {
11233         downgradable = FALSE;
11234     }
11235
11236     if (! len_passed_in) {
11237         if (UTF) {
11238             if (UNI_IS_INVARIANT(code_point)) {
11239                 if (LOC || ! FOLD) {    /* /l defers folding until runtime */
11240                     *character = (U8) code_point;
11241                 }
11242                 else { /* Here is /i and not /l. (toFOLD() is defined on just
11243                           ASCII, which isn't the same thing as INVARIANT on
11244                           EBCDIC, but it works there, as the extra invariants
11245                           fold to themselves) */
11246                     *character = toFOLD((U8) code_point);
11247
11248                     /* We can downgrade to an EXACT node if this character
11249                      * isn't a folding one.  Note that this assumes that
11250                      * nothing above Latin1 folds to some other invariant than
11251                      * one of these alphabetics; otherwise we would also have
11252                      * to check:
11253                      *  && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11254                      *      || ASCII_FOLD_RESTRICTED))
11255                      */
11256                     if (downgradable && PL_fold[code_point] == code_point) {
11257                         OP(node) = EXACT;
11258                     }
11259                 }
11260                 len = 1;
11261             }
11262             else if (FOLD && (! LOC
11263                               || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
11264             {   /* Folding, and ok to do so now */
11265                 UV folded = _to_uni_fold_flags(
11266                                    code_point,
11267                                    character,
11268                                    &len,
11269                                    FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
11270                                                       ? FOLD_FLAGS_NOMIX_ASCII
11271                                                       : 0));
11272                 if (downgradable
11273                     && folded == code_point /* This quickly rules out many
11274                                                cases, avoiding the
11275                                                _invlist_contains_cp() overhead
11276                                                for those.  */
11277                     && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
11278                 {
11279                     OP(node) = EXACT;
11280                 }
11281             }
11282             else if (code_point <= MAX_UTF8_TWO_BYTE) {
11283
11284                 /* Not folding this cp, and can output it directly */
11285                 *character = UTF8_TWO_BYTE_HI(code_point);
11286                 *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
11287                 len = 2;
11288             }
11289             else {
11290                 uvchr_to_utf8( character, code_point);
11291                 len = UTF8SKIP(character);
11292             }
11293         } /* Else pattern isn't UTF8.  */
11294         else if (! FOLD) {
11295             *character = (U8) code_point;
11296             len = 1;
11297         } /* Else is folded non-UTF8 */
11298         else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
11299
11300             /* We don't fold any non-UTF8 except possibly the Sharp s  (see
11301              * comments at join_exact()); */
11302             *character = (U8) code_point;
11303             len = 1;
11304
11305             /* Can turn into an EXACT node if we know the fold at compile time,
11306              * and it folds to itself and doesn't particpate in other folds */
11307             if (downgradable
11308                 && ! LOC
11309                 && PL_fold_latin1[code_point] == code_point
11310                 && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11311                     || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
11312             {
11313                 OP(node) = EXACT;
11314             }
11315         } /* else is Sharp s.  May need to fold it */
11316         else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
11317             *character = 's';
11318             *(character + 1) = 's';
11319             len = 2;
11320         }
11321         else {
11322             *character = LATIN_SMALL_LETTER_SHARP_S;
11323             len = 1;
11324         }
11325     }
11326
11327     if (SIZE_ONLY) {
11328         RExC_size += STR_SZ(len);
11329     }
11330     else {
11331         RExC_emit += STR_SZ(len);
11332         STR_LEN(node) = len;
11333         if (! len_passed_in) {
11334             Copy((char *) character, STRING(node), len, char);
11335         }
11336     }
11337
11338     *flagp |= HASWIDTH;
11339
11340     /* A single character node is SIMPLE, except for the special-cased SHARP S
11341      * under /di. */
11342     if ((len == 1 || (UTF && len == UNISKIP(code_point)))
11343         && (code_point != LATIN_SMALL_LETTER_SHARP_S
11344             || ! FOLD || ! DEPENDS_SEMANTICS))
11345     {
11346         *flagp |= SIMPLE;
11347     }
11348
11349     /* The OP may not be well defined in PASS1 */
11350     if (PASS2 && OP(node) == EXACTFL) {
11351         RExC_contains_locale = 1;
11352     }
11353 }
11354
11355
11356 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
11357  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
11358
11359 static I32
11360 S_backref_value(char *p)
11361 {
11362     const char* endptr;
11363     UV val = grok_atou(p, &endptr);
11364     if (endptr == p || endptr == NULL || val > I32_MAX)
11365         return I32_MAX;
11366     return (I32)val;
11367 }
11368
11369
11370 /*
11371  - regatom - the lowest level
11372
11373    Try to identify anything special at the start of the pattern. If there
11374    is, then handle it as required. This may involve generating a single regop,
11375    such as for an assertion; or it may involve recursing, such as to
11376    handle a () structure.
11377
11378    If the string doesn't start with something special then we gobble up
11379    as much literal text as we can.
11380
11381    Once we have been able to handle whatever type of thing started the
11382    sequence, we return.
11383
11384    Note: we have to be careful with escapes, as they can be both literal
11385    and special, and in the case of \10 and friends, context determines which.
11386
11387    A summary of the code structure is:
11388
11389    switch (first_byte) {
11390         cases for each special:
11391             handle this special;
11392             break;
11393         case '\\':
11394             switch (2nd byte) {
11395                 cases for each unambiguous special:
11396                     handle this special;
11397                     break;
11398                 cases for each ambigous special/literal:
11399                     disambiguate;
11400                     if (special)  handle here
11401                     else goto defchar;
11402                 default: // unambiguously literal:
11403                     goto defchar;
11404             }
11405         default:  // is a literal char
11406             // FALL THROUGH
11407         defchar:
11408             create EXACTish node for literal;
11409             while (more input and node isn't full) {
11410                 switch (input_byte) {
11411                    cases for each special;
11412                        make sure parse pointer is set so that the next call to
11413                            regatom will see this special first
11414                        goto loopdone; // EXACTish node terminated by prev. char
11415                    default:
11416                        append char to EXACTISH node;
11417                 }
11418                 get next input byte;
11419             }
11420         loopdone:
11421    }
11422    return the generated node;
11423
11424    Specifically there are two separate switches for handling
11425    escape sequences, with the one for handling literal escapes requiring
11426    a dummy entry for all of the special escapes that are actually handled
11427    by the other.
11428
11429    Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
11430    TRYAGAIN.
11431    Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
11432    restarted.
11433    Otherwise does not return NULL.
11434 */
11435
11436 STATIC regnode *
11437 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11438 {
11439     regnode *ret = NULL;
11440     I32 flags = 0;
11441     char *parse_start = RExC_parse;
11442     U8 op;
11443     int invert = 0;
11444     U8 arg;
11445
11446     GET_RE_DEBUG_FLAGS_DECL;
11447
11448     *flagp = WORST;             /* Tentatively. */
11449
11450     DEBUG_PARSE("atom");
11451
11452     PERL_ARGS_ASSERT_REGATOM;
11453
11454 tryagain:
11455     switch ((U8)*RExC_parse) {
11456     case '^':
11457         RExC_seen_zerolen++;
11458         nextchar(pRExC_state);
11459         if (RExC_flags & RXf_PMf_MULTILINE)
11460             ret = reg_node(pRExC_state, MBOL);
11461         else
11462             ret = reg_node(pRExC_state, SBOL);
11463         Set_Node_Length(ret, 1); /* MJD */
11464         break;
11465     case '$':
11466         nextchar(pRExC_state);
11467         if (*RExC_parse)
11468             RExC_seen_zerolen++;
11469         if (RExC_flags & RXf_PMf_MULTILINE)
11470             ret = reg_node(pRExC_state, MEOL);
11471         else
11472             ret = reg_node(pRExC_state, SEOL);
11473         Set_Node_Length(ret, 1); /* MJD */
11474         break;
11475     case '.':
11476         nextchar(pRExC_state);
11477         if (RExC_flags & RXf_PMf_SINGLELINE)
11478             ret = reg_node(pRExC_state, SANY);
11479         else
11480             ret = reg_node(pRExC_state, REG_ANY);
11481         *flagp |= HASWIDTH|SIMPLE;
11482         RExC_naughty++;
11483         Set_Node_Length(ret, 1); /* MJD */
11484         break;
11485     case '[':
11486     {
11487         char * const oregcomp_parse = ++RExC_parse;
11488         ret = regclass(pRExC_state, flagp,depth+1,
11489                        FALSE, /* means parse the whole char class */
11490                        TRUE, /* allow multi-char folds */
11491                        FALSE, /* don't silence non-portable warnings. */
11492                        NULL);
11493         if (*RExC_parse != ']') {
11494             RExC_parse = oregcomp_parse;
11495             vFAIL("Unmatched [");
11496         }
11497         if (ret == NULL) {
11498             if (*flagp & RESTART_UTF8)
11499                 return NULL;
11500             FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11501                   (UV) *flagp);
11502         }
11503         nextchar(pRExC_state);
11504         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
11505         break;
11506     }
11507     case '(':
11508         nextchar(pRExC_state);
11509         ret = reg(pRExC_state, 2, &flags,depth+1);
11510         if (ret == NULL) {
11511                 if (flags & TRYAGAIN) {
11512                     if (RExC_parse == RExC_end) {
11513                          /* Make parent create an empty node if needed. */
11514                         *flagp |= TRYAGAIN;
11515                         return(NULL);
11516                     }
11517                     goto tryagain;
11518                 }
11519                 if (flags & RESTART_UTF8) {
11520                     *flagp = RESTART_UTF8;
11521                     return NULL;
11522                 }
11523                 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
11524                                                                  (UV) flags);
11525         }
11526         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11527         break;
11528     case '|':
11529     case ')':
11530         if (flags & TRYAGAIN) {
11531             *flagp |= TRYAGAIN;
11532             return NULL;
11533         }
11534         vFAIL("Internal urp");
11535                                 /* Supposed to be caught earlier. */
11536         break;
11537     case '?':
11538     case '+':
11539     case '*':
11540         RExC_parse++;
11541         vFAIL("Quantifier follows nothing");
11542         break;
11543     case '\\':
11544         /* Special Escapes
11545
11546            This switch handles escape sequences that resolve to some kind
11547            of special regop and not to literal text. Escape sequnces that
11548            resolve to literal text are handled below in the switch marked
11549            "Literal Escapes".
11550
11551            Every entry in this switch *must* have a corresponding entry
11552            in the literal escape switch. However, the opposite is not
11553            required, as the default for this switch is to jump to the
11554            literal text handling code.
11555         */
11556         switch ((U8)*++RExC_parse) {
11557         /* Special Escapes */
11558         case 'A':
11559             RExC_seen_zerolen++;
11560             ret = reg_node(pRExC_state, SBOL);
11561             /* SBOL is shared with /^/ so we set the flags so we can tell
11562              * /\A/ from /^/ in split. We check ret because first pass we
11563              * have no regop struct to set the flags on. */
11564             if (PASS2)
11565                 ret->flags = 1;
11566             *flagp |= SIMPLE;
11567             goto finish_meta_pat;
11568         case 'G':
11569             ret = reg_node(pRExC_state, GPOS);
11570             RExC_seen |= REG_GPOS_SEEN;
11571             *flagp |= SIMPLE;
11572             goto finish_meta_pat;
11573         case 'K':
11574             RExC_seen_zerolen++;
11575             ret = reg_node(pRExC_state, KEEPS);
11576             *flagp |= SIMPLE;
11577             /* XXX:dmq : disabling in-place substitution seems to
11578              * be necessary here to avoid cases of memory corruption, as
11579              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
11580              */
11581             RExC_seen |= REG_LOOKBEHIND_SEEN;
11582             goto finish_meta_pat;
11583         case 'Z':
11584             ret = reg_node(pRExC_state, SEOL);
11585             *flagp |= SIMPLE;
11586             RExC_seen_zerolen++;                /* Do not optimize RE away */
11587             goto finish_meta_pat;
11588         case 'z':
11589             ret = reg_node(pRExC_state, EOS);
11590             *flagp |= SIMPLE;
11591             RExC_seen_zerolen++;                /* Do not optimize RE away */
11592             goto finish_meta_pat;
11593         case 'C':
11594             ret = reg_node(pRExC_state, CANY);
11595             RExC_seen |= REG_CANY_SEEN;
11596             *flagp |= HASWIDTH|SIMPLE;
11597             if (PASS2) {
11598                 ckWARNdep(RExC_parse+1, "\\C is deprecated");
11599             }
11600             goto finish_meta_pat;
11601         case 'X':
11602             ret = reg_node(pRExC_state, CLUMP);
11603             *flagp |= HASWIDTH;
11604             goto finish_meta_pat;
11605
11606         case 'W':
11607             invert = 1;
11608             /* FALLTHROUGH */
11609         case 'w':
11610             arg = ANYOF_WORDCHAR;
11611             goto join_posix;
11612
11613         case 'b':
11614             RExC_seen_zerolen++;
11615             RExC_seen |= REG_LOOKBEHIND_SEEN;
11616             op = BOUND + get_regex_charset(RExC_flags);
11617             if (op > BOUNDA) {  /* /aa is same as /a */
11618                 op = BOUNDA;
11619             }
11620             else if (op == BOUNDL) {
11621                 RExC_contains_locale = 1;
11622             }
11623             ret = reg_node(pRExC_state, op);
11624             FLAGS(ret) = get_regex_charset(RExC_flags);
11625             *flagp |= SIMPLE;
11626             if ((U8) *(RExC_parse + 1) == '{') {
11627                 /* diag_listed_as: Use "%s" instead of "%s" */
11628                 vFAIL("Use \"\\b\\{\" instead of \"\\b{\"");
11629             }
11630             goto finish_meta_pat;
11631         case 'B':
11632             RExC_seen_zerolen++;
11633             RExC_seen |= REG_LOOKBEHIND_SEEN;
11634             op = NBOUND + get_regex_charset(RExC_flags);
11635             if (op > NBOUNDA) { /* /aa is same as /a */
11636                 op = NBOUNDA;
11637             }
11638             else if (op == NBOUNDL) {
11639                 RExC_contains_locale = 1;
11640             }
11641             ret = reg_node(pRExC_state, op);
11642             FLAGS(ret) = get_regex_charset(RExC_flags);
11643             *flagp |= SIMPLE;
11644             if ((U8) *(RExC_parse + 1) == '{') {
11645                 /* diag_listed_as: Use "%s" instead of "%s" */
11646                 vFAIL("Use \"\\B\\{\" instead of \"\\B{\"");
11647             }
11648             goto finish_meta_pat;
11649
11650         case 'D':
11651             invert = 1;
11652             /* FALLTHROUGH */
11653         case 'd':
11654             arg = ANYOF_DIGIT;
11655             goto join_posix;
11656
11657         case 'R':
11658             ret = reg_node(pRExC_state, LNBREAK);
11659             *flagp |= HASWIDTH|SIMPLE;
11660             goto finish_meta_pat;
11661
11662         case 'H':
11663             invert = 1;
11664             /* FALLTHROUGH */
11665         case 'h':
11666             arg = ANYOF_BLANK;
11667             op = POSIXU;
11668             goto join_posix_op_known;
11669
11670         case 'V':
11671             invert = 1;
11672             /* FALLTHROUGH */
11673         case 'v':
11674             arg = ANYOF_VERTWS;
11675             op = POSIXU;
11676             goto join_posix_op_known;
11677
11678         case 'S':
11679             invert = 1;
11680             /* FALLTHROUGH */
11681         case 's':
11682             arg = ANYOF_SPACE;
11683
11684         join_posix:
11685
11686             op = POSIXD + get_regex_charset(RExC_flags);
11687             if (op > POSIXA) {  /* /aa is same as /a */
11688                 op = POSIXA;
11689             }
11690             else if (op == POSIXL) {
11691                 RExC_contains_locale = 1;
11692             }
11693
11694         join_posix_op_known:
11695
11696             if (invert) {
11697                 op += NPOSIXD - POSIXD;
11698             }
11699
11700             ret = reg_node(pRExC_state, op);
11701             if (! SIZE_ONLY) {
11702                 FLAGS(ret) = namedclass_to_classnum(arg);
11703             }
11704
11705             *flagp |= HASWIDTH|SIMPLE;
11706             /* FALLTHROUGH */
11707
11708          finish_meta_pat:
11709             nextchar(pRExC_state);
11710             Set_Node_Length(ret, 2); /* MJD */
11711             break;
11712         case 'p':
11713         case 'P':
11714             {
11715 #ifdef DEBUGGING
11716                 char* parse_start = RExC_parse - 2;
11717 #endif
11718
11719                 RExC_parse--;
11720
11721                 ret = regclass(pRExC_state, flagp,depth+1,
11722                                TRUE, /* means just parse this element */
11723                                FALSE, /* don't allow multi-char folds */
11724                                FALSE, /* don't silence non-portable warnings.
11725                                          It would be a bug if these returned
11726                                          non-portables */
11727                                NULL);
11728                 /* regclass() can only return RESTART_UTF8 if multi-char folds
11729                    are allowed.  */
11730                 if (!ret)
11731                     FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11732                           (UV) *flagp);
11733
11734                 RExC_parse--;
11735
11736                 Set_Node_Offset(ret, parse_start + 2);
11737                 Set_Node_Cur_Length(ret, parse_start);
11738                 nextchar(pRExC_state);
11739             }
11740             break;
11741         case 'N':
11742             /* Handle \N and \N{NAME} with multiple code points here and not
11743              * below because it can be multicharacter. join_exact() will join
11744              * them up later on.  Also this makes sure that things like
11745              * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
11746              * The options to the grok function call causes it to fail if the
11747              * sequence is just a single code point.  We then go treat it as
11748              * just another character in the current EXACT node, and hence it
11749              * gets uniform treatment with all the other characters.  The
11750              * special treatment for quantifiers is not needed for such single
11751              * character sequences */
11752             ++RExC_parse;
11753             if ((STRLEN) -1 == grok_bslash_N(pRExC_state, &ret, NULL, flagp,
11754                                              depth, FALSE))
11755             {
11756                 if (*flagp & RESTART_UTF8)
11757                     return NULL;
11758                 RExC_parse--;
11759                 goto defchar;
11760             }
11761             break;
11762         case 'k':    /* Handle \k<NAME> and \k'NAME' */
11763         parse_named_seq:
11764         {
11765             char ch= RExC_parse[1];
11766             if (ch != '<' && ch != '\'' && ch != '{') {
11767                 RExC_parse++;
11768                 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11769                 vFAIL2("Sequence %.2s... not terminated",parse_start);
11770             } else {
11771                 /* this pretty much dupes the code for (?P=...) in reg(), if
11772                    you change this make sure you change that */
11773                 char* name_start = (RExC_parse += 2);
11774                 U32 num = 0;
11775                 SV *sv_dat = reg_scan_name(pRExC_state,
11776                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11777                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
11778                 if (RExC_parse == name_start || *RExC_parse != ch)
11779                     /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11780                     vFAIL2("Sequence %.3s... not terminated",parse_start);
11781
11782                 if (!SIZE_ONLY) {
11783                     num = add_data( pRExC_state, STR_WITH_LEN("S"));
11784                     RExC_rxi->data->data[num]=(void*)sv_dat;
11785                     SvREFCNT_inc_simple_void(sv_dat);
11786                 }
11787
11788                 RExC_sawback = 1;
11789                 ret = reganode(pRExC_state,
11790                                ((! FOLD)
11791                                  ? NREF
11792                                  : (ASCII_FOLD_RESTRICTED)
11793                                    ? NREFFA
11794                                    : (AT_LEAST_UNI_SEMANTICS)
11795                                      ? NREFFU
11796                                      : (LOC)
11797                                        ? NREFFL
11798                                        : NREFF),
11799                                 num);
11800                 *flagp |= HASWIDTH;
11801
11802                 /* override incorrect value set in reganode MJD */
11803                 Set_Node_Offset(ret, parse_start+1);
11804                 Set_Node_Cur_Length(ret, parse_start);
11805                 nextchar(pRExC_state);
11806
11807             }
11808             break;
11809         }
11810         case 'g':
11811         case '1': case '2': case '3': case '4':
11812         case '5': case '6': case '7': case '8': case '9':
11813             {
11814                 I32 num;
11815                 bool hasbrace = 0;
11816
11817                 if (*RExC_parse == 'g') {
11818                     bool isrel = 0;
11819
11820                     RExC_parse++;
11821                     if (*RExC_parse == '{') {
11822                         RExC_parse++;
11823                         hasbrace = 1;
11824                     }
11825                     if (*RExC_parse == '-') {
11826                         RExC_parse++;
11827                         isrel = 1;
11828                     }
11829                     if (hasbrace && !isDIGIT(*RExC_parse)) {
11830                         if (isrel) RExC_parse--;
11831                         RExC_parse -= 2;
11832                         goto parse_named_seq;
11833                     }
11834
11835                     num = S_backref_value(RExC_parse);
11836                     if (num == 0)
11837                         vFAIL("Reference to invalid group 0");
11838                     else if (num == I32_MAX) {
11839                          if (isDIGIT(*RExC_parse))
11840                             vFAIL("Reference to nonexistent group");
11841                         else
11842                             vFAIL("Unterminated \\g... pattern");
11843                     }
11844
11845                     if (isrel) {
11846                         num = RExC_npar - num;
11847                         if (num < 1)
11848                             vFAIL("Reference to nonexistent or unclosed group");
11849                     }
11850                 }
11851                 else {
11852                     num = S_backref_value(RExC_parse);
11853                     /* bare \NNN might be backref or octal - if it is larger than or equal
11854                      * RExC_npar then it is assumed to be and octal escape.
11855                      * Note RExC_npar is +1 from the actual number of parens*/
11856                     if (num == I32_MAX || (num > 9 && num >= RExC_npar
11857                             && *RExC_parse != '8' && *RExC_parse != '9'))
11858                     {
11859                         /* Probably a character specified in octal, e.g. \35 */
11860                         goto defchar;
11861                     }
11862                 }
11863
11864                 /* at this point RExC_parse definitely points to a backref
11865                  * number */
11866                 {
11867 #ifdef RE_TRACK_PATTERN_OFFSETS
11868                     char * const parse_start = RExC_parse - 1; /* MJD */
11869 #endif
11870                     while (isDIGIT(*RExC_parse))
11871                         RExC_parse++;
11872                     if (hasbrace) {
11873                         if (*RExC_parse != '}')
11874                             vFAIL("Unterminated \\g{...} pattern");
11875                         RExC_parse++;
11876                     }
11877                     if (!SIZE_ONLY) {
11878                         if (num > (I32)RExC_rx->nparens)
11879                             vFAIL("Reference to nonexistent group");
11880                     }
11881                     RExC_sawback = 1;
11882                     ret = reganode(pRExC_state,
11883                                    ((! FOLD)
11884                                      ? REF
11885                                      : (ASCII_FOLD_RESTRICTED)
11886                                        ? REFFA
11887                                        : (AT_LEAST_UNI_SEMANTICS)
11888                                          ? REFFU
11889                                          : (LOC)
11890                                            ? REFFL
11891                                            : REFF),
11892                                     num);
11893                     *flagp |= HASWIDTH;
11894
11895                     /* override incorrect value set in reganode MJD */
11896                     Set_Node_Offset(ret, parse_start+1);
11897                     Set_Node_Cur_Length(ret, parse_start);
11898                     RExC_parse--;
11899                     nextchar(pRExC_state);
11900                 }
11901             }
11902             break;
11903         case '\0':
11904             if (RExC_parse >= RExC_end)
11905                 FAIL("Trailing \\");
11906             /* FALLTHROUGH */
11907         default:
11908             /* Do not generate "unrecognized" warnings here, we fall
11909                back into the quick-grab loop below */
11910             parse_start--;
11911             goto defchar;
11912         }
11913         break;
11914
11915     case '#':
11916         if (RExC_flags & RXf_PMf_EXTENDED) {
11917             RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
11918             if (RExC_parse < RExC_end)
11919                 goto tryagain;
11920         }
11921         /* FALLTHROUGH */
11922
11923     default:
11924
11925             parse_start = RExC_parse - 1;
11926
11927             RExC_parse++;
11928
11929         defchar: {
11930             STRLEN len = 0;
11931             UV ender = 0;
11932             char *p;
11933             char *s;
11934 #define MAX_NODE_STRING_SIZE 127
11935             char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
11936             char *s0;
11937             U8 upper_parse = MAX_NODE_STRING_SIZE;
11938             U8 node_type = compute_EXACTish(pRExC_state);
11939             bool next_is_quantifier;
11940             char * oldp = NULL;
11941
11942             /* We can convert EXACTF nodes to EXACTFU if they contain only
11943              * characters that match identically regardless of the target
11944              * string's UTF8ness.  The reason to do this is that EXACTF is not
11945              * trie-able, EXACTFU is.
11946              *
11947              * Similarly, we can convert EXACTFL nodes to EXACTFU if they
11948              * contain only above-Latin1 characters (hence must be in UTF8),
11949              * which don't participate in folds with Latin1-range characters,
11950              * as the latter's folds aren't known until runtime.  (We don't
11951              * need to figure this out until pass 2) */
11952             bool maybe_exactfu = PASS2
11953                                && (node_type == EXACTF || node_type == EXACTFL);
11954
11955             /* If a folding node contains only code points that don't
11956              * participate in folds, it can be changed into an EXACT node,
11957              * which allows the optimizer more things to look for */
11958             bool maybe_exact;
11959
11960             ret = reg_node(pRExC_state, node_type);
11961
11962             /* In pass1, folded, we use a temporary buffer instead of the
11963              * actual node, as the node doesn't exist yet */
11964             s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
11965
11966             s0 = s;
11967
11968         reparse:
11969
11970             /* We do the EXACTFish to EXACT node only if folding.  (And we
11971              * don't need to figure this out until pass 2) */
11972             maybe_exact = FOLD && PASS2;
11973
11974             /* XXX The node can hold up to 255 bytes, yet this only goes to
11975              * 127.  I (khw) do not know why.  Keeping it somewhat less than
11976              * 255 allows us to not have to worry about overflow due to
11977              * converting to utf8 and fold expansion, but that value is
11978              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
11979              * split up by this limit into a single one using the real max of
11980              * 255.  Even at 127, this breaks under rare circumstances.  If
11981              * folding, we do not want to split a node at a character that is a
11982              * non-final in a multi-char fold, as an input string could just
11983              * happen to want to match across the node boundary.  The join
11984              * would solve that problem if the join actually happens.  But a
11985              * series of more than two nodes in a row each of 127 would cause
11986              * the first join to succeed to get to 254, but then there wouldn't
11987              * be room for the next one, which could at be one of those split
11988              * multi-char folds.  I don't know of any fool-proof solution.  One
11989              * could back off to end with only a code point that isn't such a
11990              * non-final, but it is possible for there not to be any in the
11991              * entire node. */
11992             for (p = RExC_parse - 1;
11993                  len < upper_parse && p < RExC_end;
11994                  len++)
11995             {
11996                 oldp = p;
11997
11998                 if (RExC_flags & RXf_PMf_EXTENDED)
11999                     p = regpatws(pRExC_state, p,
12000                                           TRUE); /* means recognize comments */
12001                 switch ((U8)*p) {
12002                 case '^':
12003                 case '$':
12004                 case '.':
12005                 case '[':
12006                 case '(':
12007                 case ')':
12008                 case '|':
12009                     goto loopdone;
12010                 case '\\':
12011                     /* Literal Escapes Switch
12012
12013                        This switch is meant to handle escape sequences that
12014                        resolve to a literal character.
12015
12016                        Every escape sequence that represents something
12017                        else, like an assertion or a char class, is handled
12018                        in the switch marked 'Special Escapes' above in this
12019                        routine, but also has an entry here as anything that
12020                        isn't explicitly mentioned here will be treated as
12021                        an unescaped equivalent literal.
12022                     */
12023
12024                     switch ((U8)*++p) {
12025                     /* These are all the special escapes. */
12026                     case 'A':             /* Start assertion */
12027                     case 'b': case 'B':   /* Word-boundary assertion*/
12028                     case 'C':             /* Single char !DANGEROUS! */
12029                     case 'd': case 'D':   /* digit class */
12030                     case 'g': case 'G':   /* generic-backref, pos assertion */
12031                     case 'h': case 'H':   /* HORIZWS */
12032                     case 'k': case 'K':   /* named backref, keep marker */
12033                     case 'p': case 'P':   /* Unicode property */
12034                               case 'R':   /* LNBREAK */
12035                     case 's': case 'S':   /* space class */
12036                     case 'v': case 'V':   /* VERTWS */
12037                     case 'w': case 'W':   /* word class */
12038                     case 'X':             /* eXtended Unicode "combining
12039                                              character sequence" */
12040                     case 'z': case 'Z':   /* End of line/string assertion */
12041                         --p;
12042                         goto loopdone;
12043
12044                     /* Anything after here is an escape that resolves to a
12045                        literal. (Except digits, which may or may not)
12046                      */
12047                     case 'n':
12048                         ender = '\n';
12049                         p++;
12050                         break;
12051                     case 'N': /* Handle a single-code point named character. */
12052                         /* The options cause it to fail if a multiple code
12053                          * point sequence.  Handle those in the switch() above
12054                          * */
12055                         RExC_parse = p + 1;
12056                         if ((STRLEN) -1 == grok_bslash_N(pRExC_state, NULL,
12057                                                          &ender,
12058                                                          flagp,
12059                                                          depth,
12060                                                          FALSE
12061                         )) {
12062                             if (*flagp & RESTART_UTF8)
12063                                 FAIL("panic: grok_bslash_N set RESTART_UTF8");
12064                             RExC_parse = p = oldp;
12065                             goto loopdone;
12066                         }
12067                         p = RExC_parse;
12068                         if (ender > 0xff) {
12069                             REQUIRE_UTF8;
12070                         }
12071                         break;
12072                     case 'r':
12073                         ender = '\r';
12074                         p++;
12075                         break;
12076                     case 't':
12077                         ender = '\t';
12078                         p++;
12079                         break;
12080                     case 'f':
12081                         ender = '\f';
12082                         p++;
12083                         break;
12084                     case 'e':
12085                         ender = ESC_NATIVE;
12086                         p++;
12087                         break;
12088                     case 'a':
12089                         ender = '\a';
12090                         p++;
12091                         break;
12092                     case 'o':
12093                         {
12094                             UV result;
12095                             const char* error_msg;
12096
12097                             bool valid = grok_bslash_o(&p,
12098                                                        &result,
12099                                                        &error_msg,
12100                                                        PASS2, /* out warnings */
12101                                                        FALSE, /* not strict */
12102                                                        TRUE, /* Output warnings
12103                                                                 for non-
12104                                                                 portables */
12105                                                        UTF);
12106                             if (! valid) {
12107                                 RExC_parse = p; /* going to die anyway; point
12108                                                    to exact spot of failure */
12109                                 vFAIL(error_msg);
12110                             }
12111                             ender = result;
12112                             if (PL_encoding && ender < 0x100) {
12113                                 goto recode_encoding;
12114                             }
12115                             if (ender > 0xff) {
12116                                 REQUIRE_UTF8;
12117                             }
12118                             break;
12119                         }
12120                     case 'x':
12121                         {
12122                             UV result = UV_MAX; /* initialize to erroneous
12123                                                    value */
12124                             const char* error_msg;
12125
12126                             bool valid = grok_bslash_x(&p,
12127                                                        &result,
12128                                                        &error_msg,
12129                                                        PASS2, /* out warnings */
12130                                                        FALSE, /* not strict */
12131                                                        TRUE, /* Output warnings
12132                                                                 for non-
12133                                                                 portables */
12134                                                        UTF);
12135                             if (! valid) {
12136                                 RExC_parse = p; /* going to die anyway; point
12137                                                    to exact spot of failure */
12138                                 vFAIL(error_msg);
12139                             }
12140                             ender = result;
12141
12142                             if (PL_encoding && ender < 0x100) {
12143                                 goto recode_encoding;
12144                             }
12145                             if (ender > 0xff) {
12146                                 REQUIRE_UTF8;
12147                             }
12148                             break;
12149                         }
12150                     case 'c':
12151                         p++;
12152                         ender = grok_bslash_c(*p++, PASS2);
12153                         break;
12154                     case '8': case '9': /* must be a backreference */
12155                         --p;
12156                         goto loopdone;
12157                     case '1': case '2': case '3':case '4':
12158                     case '5': case '6': case '7':
12159                         /* When we parse backslash escapes there is ambiguity
12160                          * between backreferences and octal escapes. Any escape
12161                          * from \1 - \9 is a backreference, any multi-digit
12162                          * escape which does not start with 0 and which when
12163                          * evaluated as decimal could refer to an already
12164                          * parsed capture buffer is a backslash. Anything else
12165                          * is octal.
12166                          *
12167                          * Note this implies that \118 could be interpreted as
12168                          * 118 OR as "\11" . "8" depending on whether there
12169                          * were 118 capture buffers defined already in the
12170                          * pattern.  */
12171
12172                         /* NOTE, RExC_npar is 1 more than the actual number of
12173                          * parens we have seen so far, hence the < RExC_npar below. */
12174
12175                         if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
12176                         {  /* Not to be treated as an octal constant, go
12177                                    find backref */
12178                             --p;
12179                             goto loopdone;
12180                         }
12181                         /* FALLTHROUGH */
12182                     case '0':
12183                         {
12184                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
12185                             STRLEN numlen = 3;
12186                             ender = grok_oct(p, &numlen, &flags, NULL);
12187                             if (ender > 0xff) {
12188                                 REQUIRE_UTF8;
12189                             }
12190                             p += numlen;
12191                             if (PASS2   /* like \08, \178 */
12192                                 && numlen < 3
12193                                 && p < RExC_end
12194                                 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
12195                             {
12196                                 reg_warn_non_literal_string(
12197                                          p + 1,
12198                                          form_short_octal_warning(p, numlen));
12199                             }
12200                         }
12201                         if (PL_encoding && ender < 0x100)
12202                             goto recode_encoding;
12203                         break;
12204                     recode_encoding:
12205                         if (! RExC_override_recoding) {
12206                             SV* enc = PL_encoding;
12207                             ender = reg_recode((const char)(U8)ender, &enc);
12208                             if (!enc && PASS2)
12209                                 ckWARNreg(p, "Invalid escape in the specified encoding");
12210                             REQUIRE_UTF8;
12211                         }
12212                         break;
12213                     case '\0':
12214                         if (p >= RExC_end)
12215                             FAIL("Trailing \\");
12216                         /* FALLTHROUGH */
12217                     default:
12218                         if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
12219                             /* Include any { following the alpha to emphasize
12220                              * that it could be part of an escape at some point
12221                              * in the future */
12222                             int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
12223                             ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
12224                         }
12225                         goto normal_default;
12226                     } /* End of switch on '\' */
12227                     break;
12228                 case '{':
12229                     /* Currently we don't warn when the lbrace is at the start
12230                      * of a construct.  This catches it in the middle of a
12231                      * literal string, or when its the first thing after
12232                      * something like "\b" */
12233                     if (! SIZE_ONLY
12234                         && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
12235                     {
12236                         ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
12237                     }
12238                     /*FALLTHROUGH*/
12239                 default:    /* A literal character */
12240                   normal_default:
12241                     if (UTF8_IS_START(*p) && UTF) {
12242                         STRLEN numlen;
12243                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
12244                                                &numlen, UTF8_ALLOW_DEFAULT);
12245                         p += numlen;
12246                     }
12247                     else
12248                         ender = (U8) *p++;
12249                     break;
12250                 } /* End of switch on the literal */
12251
12252                 /* Here, have looked at the literal character and <ender>
12253                  * contains its ordinal, <p> points to the character after it
12254                  */
12255
12256                 if ( RExC_flags & RXf_PMf_EXTENDED)
12257                     p = regpatws(pRExC_state, p,
12258                                           TRUE); /* means recognize comments */
12259
12260                 /* If the next thing is a quantifier, it applies to this
12261                  * character only, which means that this character has to be in
12262                  * its own node and can't just be appended to the string in an
12263                  * existing node, so if there are already other characters in
12264                  * the node, close the node with just them, and set up to do
12265                  * this character again next time through, when it will be the
12266                  * only thing in its new node */
12267                 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
12268                 {
12269                     p = oldp;
12270                     goto loopdone;
12271                 }
12272
12273                 if (! FOLD   /* The simple case, just append the literal */
12274                     || (LOC  /* Also don't fold for tricky chars under /l */
12275                         && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)))
12276                 {
12277                     if (UTF) {
12278                         const STRLEN unilen = reguni(pRExC_state, ender, s);
12279                         if (unilen > 0) {
12280                            s   += unilen;
12281                            len += unilen;
12282                         }
12283
12284                         /* The loop increments <len> each time, as all but this
12285                          * path (and one other) through it add a single byte to
12286                          * the EXACTish node.  But this one has changed len to
12287                          * be the correct final value, so subtract one to
12288                          * cancel out the increment that follows */
12289                         len--;
12290                     }
12291                     else {
12292                         REGC((char)ender, s++);
12293                     }
12294
12295                     /* Can get here if folding only if is one of the /l
12296                      * characters whose fold depends on the locale.  The
12297                      * occurrence of any of these indicate that we can't
12298                      * simplify things */
12299                     if (FOLD) {
12300                         maybe_exact = FALSE;
12301                         maybe_exactfu = FALSE;
12302                     }
12303                 }
12304                 else             /* FOLD */
12305                      if (! ( UTF
12306                         /* See comments for join_exact() as to why we fold this
12307                          * non-UTF at compile time */
12308                         || (node_type == EXACTFU
12309                             && ender == LATIN_SMALL_LETTER_SHARP_S)))
12310                 {
12311                     /* Here, are folding and are not UTF-8 encoded; therefore
12312                      * the character must be in the range 0-255, and is not /l
12313                      * (Not /l because we already handled these under /l in
12314                      * is_PROBLEMATIC_LOCALE_FOLD_cp */
12315                     if (IS_IN_SOME_FOLD_L1(ender)) {
12316                         maybe_exact = FALSE;
12317
12318                         /* See if the character's fold differs between /d and
12319                          * /u.  This includes the multi-char fold SHARP S to
12320                          * 'ss' */
12321                         if (maybe_exactfu
12322                             && (PL_fold[ender] != PL_fold_latin1[ender]
12323                                 || ender == LATIN_SMALL_LETTER_SHARP_S
12324                                 || (len > 0
12325                                    && isALPHA_FOLD_EQ(ender, 's')
12326                                    && isALPHA_FOLD_EQ(*(s-1), 's'))))
12327                         {
12328                             maybe_exactfu = FALSE;
12329                         }
12330                     }
12331
12332                     /* Even when folding, we store just the input character, as
12333                      * we have an array that finds its fold quickly */
12334                     *(s++) = (char) ender;
12335                 }
12336                 else {  /* FOLD and UTF */
12337                     /* Unlike the non-fold case, we do actually have to
12338                      * calculate the results here in pass 1.  This is for two
12339                      * reasons, the folded length may be longer than the
12340                      * unfolded, and we have to calculate how many EXACTish
12341                      * nodes it will take; and we may run out of room in a node
12342                      * in the middle of a potential multi-char fold, and have
12343                      * to back off accordingly.  (Hence we can't use REGC for
12344                      * the simple case just below.) */
12345
12346                     UV folded;
12347                     if (isASCII(ender)) {
12348                         folded = toFOLD(ender);
12349                         *(s)++ = (U8) folded;
12350                     }
12351                     else {
12352                         STRLEN foldlen;
12353
12354                         folded = _to_uni_fold_flags(
12355                                      ender,
12356                                      (U8 *) s,
12357                                      &foldlen,
12358                                      FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12359                                                         ? FOLD_FLAGS_NOMIX_ASCII
12360                                                         : 0));
12361                         s += foldlen;
12362
12363                         /* The loop increments <len> each time, as all but this
12364                          * path (and one other) through it add a single byte to
12365                          * the EXACTish node.  But this one has changed len to
12366                          * be the correct final value, so subtract one to
12367                          * cancel out the increment that follows */
12368                         len += foldlen - 1;
12369                     }
12370                     /* If this node only contains non-folding code points so
12371                      * far, see if this new one is also non-folding */
12372                     if (maybe_exact) {
12373                         if (folded != ender) {
12374                             maybe_exact = FALSE;
12375                         }
12376                         else {
12377                             /* Here the fold is the original; we have to check
12378                              * further to see if anything folds to it */
12379                             if (_invlist_contains_cp(PL_utf8_foldable,
12380                                                         ender))
12381                             {
12382                                 maybe_exact = FALSE;
12383                             }
12384                         }
12385                     }
12386                     ender = folded;
12387                 }
12388
12389                 if (next_is_quantifier) {
12390
12391                     /* Here, the next input is a quantifier, and to get here,
12392                      * the current character is the only one in the node.
12393                      * Also, here <len> doesn't include the final byte for this
12394                      * character */
12395                     len++;
12396                     goto loopdone;
12397                 }
12398
12399             } /* End of loop through literal characters */
12400
12401             /* Here we have either exhausted the input or ran out of room in
12402              * the node.  (If we encountered a character that can't be in the
12403              * node, transfer is made directly to <loopdone>, and so we
12404              * wouldn't have fallen off the end of the loop.)  In the latter
12405              * case, we artificially have to split the node into two, because
12406              * we just don't have enough space to hold everything.  This
12407              * creates a problem if the final character participates in a
12408              * multi-character fold in the non-final position, as a match that
12409              * should have occurred won't, due to the way nodes are matched,
12410              * and our artificial boundary.  So back off until we find a non-
12411              * problematic character -- one that isn't at the beginning or
12412              * middle of such a fold.  (Either it doesn't participate in any
12413              * folds, or appears only in the final position of all the folds it
12414              * does participate in.)  A better solution with far fewer false
12415              * positives, and that would fill the nodes more completely, would
12416              * be to actually have available all the multi-character folds to
12417              * test against, and to back-off only far enough to be sure that
12418              * this node isn't ending with a partial one.  <upper_parse> is set
12419              * further below (if we need to reparse the node) to include just
12420              * up through that final non-problematic character that this code
12421              * identifies, so when it is set to less than the full node, we can
12422              * skip the rest of this */
12423             if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
12424
12425                 const STRLEN full_len = len;
12426
12427                 assert(len >= MAX_NODE_STRING_SIZE);
12428
12429                 /* Here, <s> points to the final byte of the final character.
12430                  * Look backwards through the string until find a non-
12431                  * problematic character */
12432
12433                 if (! UTF) {
12434
12435                     /* This has no multi-char folds to non-UTF characters */
12436                     if (ASCII_FOLD_RESTRICTED) {
12437                         goto loopdone;
12438                     }
12439
12440                     while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
12441                     len = s - s0 + 1;
12442                 }
12443                 else {
12444                     if (!  PL_NonL1NonFinalFold) {
12445                         PL_NonL1NonFinalFold = _new_invlist_C_array(
12446                                         NonL1_Perl_Non_Final_Folds_invlist);
12447                     }
12448
12449                     /* Point to the first byte of the final character */
12450                     s = (char *) utf8_hop((U8 *) s, -1);
12451
12452                     while (s >= s0) {   /* Search backwards until find
12453                                            non-problematic char */
12454                         if (UTF8_IS_INVARIANT(*s)) {
12455
12456                             /* There are no ascii characters that participate
12457                              * in multi-char folds under /aa.  In EBCDIC, the
12458                              * non-ascii invariants are all control characters,
12459                              * so don't ever participate in any folds. */
12460                             if (ASCII_FOLD_RESTRICTED
12461                                 || ! IS_NON_FINAL_FOLD(*s))
12462                             {
12463                                 break;
12464                             }
12465                         }
12466                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
12467                             if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
12468                                                                   *s, *(s+1))))
12469                             {
12470                                 break;
12471                             }
12472                         }
12473                         else if (! _invlist_contains_cp(
12474                                         PL_NonL1NonFinalFold,
12475                                         valid_utf8_to_uvchr((U8 *) s, NULL)))
12476                         {
12477                             break;
12478                         }
12479
12480                         /* Here, the current character is problematic in that
12481                          * it does occur in the non-final position of some
12482                          * fold, so try the character before it, but have to
12483                          * special case the very first byte in the string, so
12484                          * we don't read outside the string */
12485                         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
12486                     } /* End of loop backwards through the string */
12487
12488                     /* If there were only problematic characters in the string,
12489                      * <s> will point to before s0, in which case the length
12490                      * should be 0, otherwise include the length of the
12491                      * non-problematic character just found */
12492                     len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
12493                 }
12494
12495                 /* Here, have found the final character, if any, that is
12496                  * non-problematic as far as ending the node without splitting
12497                  * it across a potential multi-char fold.  <len> contains the
12498                  * number of bytes in the node up-to and including that
12499                  * character, or is 0 if there is no such character, meaning
12500                  * the whole node contains only problematic characters.  In
12501                  * this case, give up and just take the node as-is.  We can't
12502                  * do any better */
12503                 if (len == 0) {
12504                     len = full_len;
12505
12506                     /* If the node ends in an 's' we make sure it stays EXACTF,
12507                      * as if it turns into an EXACTFU, it could later get
12508                      * joined with another 's' that would then wrongly match
12509                      * the sharp s */
12510                     if (maybe_exactfu && isALPHA_FOLD_EQ(ender, 's'))
12511                     {
12512                         maybe_exactfu = FALSE;
12513                     }
12514                 } else {
12515
12516                     /* Here, the node does contain some characters that aren't
12517                      * problematic.  If one such is the final character in the
12518                      * node, we are done */
12519                     if (len == full_len) {
12520                         goto loopdone;
12521                     }
12522                     else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
12523
12524                         /* If the final character is problematic, but the
12525                          * penultimate is not, back-off that last character to
12526                          * later start a new node with it */
12527                         p = oldp;
12528                         goto loopdone;
12529                     }
12530
12531                     /* Here, the final non-problematic character is earlier
12532                      * in the input than the penultimate character.  What we do
12533                      * is reparse from the beginning, going up only as far as
12534                      * this final ok one, thus guaranteeing that the node ends
12535                      * in an acceptable character.  The reason we reparse is
12536                      * that we know how far in the character is, but we don't
12537                      * know how to correlate its position with the input parse.
12538                      * An alternate implementation would be to build that
12539                      * correlation as we go along during the original parse,
12540                      * but that would entail extra work for every node, whereas
12541                      * this code gets executed only when the string is too
12542                      * large for the node, and the final two characters are
12543                      * problematic, an infrequent occurrence.  Yet another
12544                      * possible strategy would be to save the tail of the
12545                      * string, and the next time regatom is called, initialize
12546                      * with that.  The problem with this is that unless you
12547                      * back off one more character, you won't be guaranteed
12548                      * regatom will get called again, unless regbranch,
12549                      * regpiece ... are also changed.  If you do back off that
12550                      * extra character, so that there is input guaranteed to
12551                      * force calling regatom, you can't handle the case where
12552                      * just the first character in the node is acceptable.  I
12553                      * (khw) decided to try this method which doesn't have that
12554                      * pitfall; if performance issues are found, we can do a
12555                      * combination of the current approach plus that one */
12556                     upper_parse = len;
12557                     len = 0;
12558                     s = s0;
12559                     goto reparse;
12560                 }
12561             }   /* End of verifying node ends with an appropriate char */
12562
12563         loopdone:   /* Jumped to when encounters something that shouldn't be in
12564                        the node */
12565
12566             /* I (khw) don't know if you can get here with zero length, but the
12567              * old code handled this situation by creating a zero-length EXACT
12568              * node.  Might as well be NOTHING instead */
12569             if (len == 0) {
12570                 OP(ret) = NOTHING;
12571             }
12572             else {
12573                 if (FOLD) {
12574                     /* If 'maybe_exact' is still set here, means there are no
12575                      * code points in the node that participate in folds;
12576                      * similarly for 'maybe_exactfu' and code points that match
12577                      * differently depending on UTF8ness of the target string
12578                      * (for /u), or depending on locale for /l */
12579                     if (maybe_exact) {
12580                         OP(ret) = EXACT;
12581                     }
12582                     else if (maybe_exactfu) {
12583                         OP(ret) = EXACTFU;
12584                     }
12585                 }
12586                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
12587                                            FALSE /* Don't look to see if could
12588                                                     be turned into an EXACT
12589                                                     node, as we have already
12590                                                     computed that */
12591                                           );
12592             }
12593
12594             RExC_parse = p - 1;
12595             Set_Node_Cur_Length(ret, parse_start);
12596             nextchar(pRExC_state);
12597             {
12598                 /* len is STRLEN which is unsigned, need to copy to signed */
12599                 IV iv = len;
12600                 if (iv < 0)
12601                     vFAIL("Internal disaster");
12602             }
12603
12604         } /* End of label 'defchar:' */
12605         break;
12606     } /* End of giant switch on input character */
12607
12608     return(ret);
12609 }
12610
12611 STATIC char *
12612 S_regpatws(RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
12613 {
12614     /* Returns the next non-pattern-white space, non-comment character (the
12615      * latter only if 'recognize_comment is true) in the string p, which is
12616      * ended by RExC_end.  See also reg_skipcomment */
12617     const char *e = RExC_end;
12618
12619     PERL_ARGS_ASSERT_REGPATWS;
12620
12621     while (p < e) {
12622         STRLEN len;
12623         if ((len = is_PATWS_safe(p, e, UTF))) {
12624             p += len;
12625         }
12626         else if (recognize_comment && *p == '#') {
12627             p = reg_skipcomment(pRExC_state, p);
12628         }
12629         else
12630             break;
12631     }
12632     return p;
12633 }
12634
12635 STATIC void
12636 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
12637 {
12638     /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
12639      * sets up the bitmap and any flags, removing those code points from the
12640      * inversion list, setting it to NULL should it become completely empty */
12641
12642     PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
12643     assert(PL_regkind[OP(node)] == ANYOF);
12644
12645     ANYOF_BITMAP_ZERO(node);
12646     if (*invlist_ptr) {
12647
12648         /* This gets set if we actually need to modify things */
12649         bool change_invlist = FALSE;
12650
12651         UV start, end;
12652
12653         /* Start looking through *invlist_ptr */
12654         invlist_iterinit(*invlist_ptr);
12655         while (invlist_iternext(*invlist_ptr, &start, &end)) {
12656             UV high;
12657             int i;
12658
12659             if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
12660                 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
12661             }
12662             else if (end >= NUM_ANYOF_CODE_POINTS) {
12663                 ANYOF_FLAGS(node) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
12664             }
12665
12666             /* Quit if are above what we should change */
12667             if (start >= NUM_ANYOF_CODE_POINTS) {
12668                 break;
12669             }
12670
12671             change_invlist = TRUE;
12672
12673             /* Set all the bits in the range, up to the max that we are doing */
12674             high = (end < NUM_ANYOF_CODE_POINTS - 1)
12675                    ? end
12676                    : NUM_ANYOF_CODE_POINTS - 1;
12677             for (i = start; i <= (int) high; i++) {
12678                 if (! ANYOF_BITMAP_TEST(node, i)) {
12679                     ANYOF_BITMAP_SET(node, i);
12680                 }
12681             }
12682         }
12683         invlist_iterfinish(*invlist_ptr);
12684
12685         /* Done with loop; remove any code points that are in the bitmap from
12686          * *invlist_ptr; similarly for code points above the bitmap if we have
12687          * a flag to match all of them anyways */
12688         if (change_invlist) {
12689             _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
12690         }
12691         if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
12692             _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
12693         }
12694
12695         /* If have completely emptied it, remove it completely */
12696         if (_invlist_len(*invlist_ptr) == 0) {
12697             SvREFCNT_dec_NN(*invlist_ptr);
12698             *invlist_ptr = NULL;
12699         }
12700     }
12701 }
12702
12703 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
12704    Character classes ([:foo:]) can also be negated ([:^foo:]).
12705    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
12706    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
12707    but trigger failures because they are currently unimplemented. */
12708
12709 #define POSIXCC_DONE(c)   ((c) == ':')
12710 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
12711 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
12712
12713 PERL_STATIC_INLINE I32
12714 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
12715 {
12716     I32 namedclass = OOB_NAMEDCLASS;
12717
12718     PERL_ARGS_ASSERT_REGPPOSIXCC;
12719
12720     if (value == '[' && RExC_parse + 1 < RExC_end &&
12721         /* I smell either [: or [= or [. -- POSIX has been here, right? */
12722         POSIXCC(UCHARAT(RExC_parse)))
12723     {
12724         const char c = UCHARAT(RExC_parse);
12725         char* const s = RExC_parse++;
12726
12727         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
12728             RExC_parse++;
12729         if (RExC_parse == RExC_end) {
12730             if (strict) {
12731
12732                 /* Try to give a better location for the error (than the end of
12733                  * the string) by looking for the matching ']' */
12734                 RExC_parse = s;
12735                 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
12736                     RExC_parse++;
12737                 }
12738                 vFAIL2("Unmatched '%c' in POSIX class", c);
12739             }
12740             /* Grandfather lone [:, [=, [. */
12741             RExC_parse = s;
12742         }
12743         else {
12744             const char* const t = RExC_parse++; /* skip over the c */
12745             assert(*t == c);
12746
12747             if (UCHARAT(RExC_parse) == ']') {
12748                 const char *posixcc = s + 1;
12749                 RExC_parse++; /* skip over the ending ] */
12750
12751                 if (*s == ':') {
12752                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
12753                     const I32 skip = t - posixcc;
12754
12755                     /* Initially switch on the length of the name.  */
12756                     switch (skip) {
12757                     case 4:
12758                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
12759                                                           this is the Perl \w
12760                                                         */
12761                             namedclass = ANYOF_WORDCHAR;
12762                         break;
12763                     case 5:
12764                         /* Names all of length 5.  */
12765                         /* alnum alpha ascii blank cntrl digit graph lower
12766                            print punct space upper  */
12767                         /* Offset 4 gives the best switch position.  */
12768                         switch (posixcc[4]) {
12769                         case 'a':
12770                             if (memEQ(posixcc, "alph", 4)) /* alpha */
12771                                 namedclass = ANYOF_ALPHA;
12772                             break;
12773                         case 'e':
12774                             if (memEQ(posixcc, "spac", 4)) /* space */
12775                                 namedclass = ANYOF_PSXSPC;
12776                             break;
12777                         case 'h':
12778                             if (memEQ(posixcc, "grap", 4)) /* graph */
12779                                 namedclass = ANYOF_GRAPH;
12780                             break;
12781                         case 'i':
12782                             if (memEQ(posixcc, "asci", 4)) /* ascii */
12783                                 namedclass = ANYOF_ASCII;
12784                             break;
12785                         case 'k':
12786                             if (memEQ(posixcc, "blan", 4)) /* blank */
12787                                 namedclass = ANYOF_BLANK;
12788                             break;
12789                         case 'l':
12790                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
12791                                 namedclass = ANYOF_CNTRL;
12792                             break;
12793                         case 'm':
12794                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
12795                                 namedclass = ANYOF_ALPHANUMERIC;
12796                             break;
12797                         case 'r':
12798                             if (memEQ(posixcc, "lowe", 4)) /* lower */
12799                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
12800                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
12801                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
12802                             break;
12803                         case 't':
12804                             if (memEQ(posixcc, "digi", 4)) /* digit */
12805                                 namedclass = ANYOF_DIGIT;
12806                             else if (memEQ(posixcc, "prin", 4)) /* print */
12807                                 namedclass = ANYOF_PRINT;
12808                             else if (memEQ(posixcc, "punc", 4)) /* punct */
12809                                 namedclass = ANYOF_PUNCT;
12810                             break;
12811                         }
12812                         break;
12813                     case 6:
12814                         if (memEQ(posixcc, "xdigit", 6))
12815                             namedclass = ANYOF_XDIGIT;
12816                         break;
12817                     }
12818
12819                     if (namedclass == OOB_NAMEDCLASS)
12820                         vFAIL2utf8f(
12821                             "POSIX class [:%"UTF8f":] unknown",
12822                             UTF8fARG(UTF, t - s - 1, s + 1));
12823
12824                     /* The #defines are structured so each complement is +1 to
12825                      * the normal one */
12826                     if (complement) {
12827                         namedclass++;
12828                     }
12829                     assert (posixcc[skip] == ':');
12830                     assert (posixcc[skip+1] == ']');
12831                 } else if (!SIZE_ONLY) {
12832                     /* [[=foo=]] and [[.foo.]] are still future. */
12833
12834                     /* adjust RExC_parse so the warning shows after
12835                        the class closes */
12836                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
12837                         RExC_parse++;
12838                     vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
12839                 }
12840             } else {
12841                 /* Maternal grandfather:
12842                  * "[:" ending in ":" but not in ":]" */
12843                 if (strict) {
12844                     vFAIL("Unmatched '[' in POSIX class");
12845                 }
12846
12847                 /* Grandfather lone [:, [=, [. */
12848                 RExC_parse = s;
12849             }
12850         }
12851     }
12852
12853     return namedclass;
12854 }
12855
12856 STATIC bool
12857 S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state)
12858 {
12859     /* This applies some heuristics at the current parse position (which should
12860      * be at a '[') to see if what follows might be intended to be a [:posix:]
12861      * class.  It returns true if it really is a posix class, of course, but it
12862      * also can return true if it thinks that what was intended was a posix
12863      * class that didn't quite make it.
12864      *
12865      * It will return true for
12866      *      [:alphanumerics:
12867      *      [:alphanumerics]  (as long as the ] isn't followed immediately by a
12868      *                         ')' indicating the end of the (?[
12869      *      [:any garbage including %^&$ punctuation:]
12870      *
12871      * This is designed to be called only from S_handle_regex_sets; it could be
12872      * easily adapted to be called from the spot at the beginning of regclass()
12873      * that checks to see in a normal bracketed class if the surrounding []
12874      * have been omitted ([:word:] instead of [[:word:]]).  But doing so would
12875      * change long-standing behavior, so I (khw) didn't do that */
12876     char* p = RExC_parse + 1;
12877     char first_char = *p;
12878
12879     PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
12880
12881     assert(*(p - 1) == '[');
12882
12883     if (! POSIXCC(first_char)) {
12884         return FALSE;
12885     }
12886
12887     p++;
12888     while (p < RExC_end && isWORDCHAR(*p)) p++;
12889
12890     if (p >= RExC_end) {
12891         return FALSE;
12892     }
12893
12894     if (p - RExC_parse > 2    /* Got at least 1 word character */
12895         && (*p == first_char
12896             || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
12897     {
12898         return TRUE;
12899     }
12900
12901     p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
12902
12903     return (p
12904             && p - RExC_parse > 2 /* [:] evaluates to colon;
12905                                       [::] is a bad posix class. */
12906             && first_char == *(p - 1));
12907 }
12908
12909 STATIC regnode *
12910 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
12911                     I32 *flagp, U32 depth,
12912                     char * const oregcomp_parse)
12913 {
12914     /* Handle the (?[...]) construct to do set operations */
12915
12916     U8 curchar;
12917     UV start, end;      /* End points of code point ranges */
12918     SV* result_string;
12919     char *save_end, *save_parse;
12920     SV* final;
12921     STRLEN len;
12922     regnode* node;
12923     AV* stack;
12924     const bool save_fold = FOLD;
12925
12926     GET_RE_DEBUG_FLAGS_DECL;
12927
12928     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
12929
12930     if (LOC) {
12931         vFAIL("(?[...]) not valid in locale");
12932     }
12933     RExC_uni_semantics = 1;
12934
12935     /* This will return only an ANYOF regnode, or (unlikely) something smaller
12936      * (such as EXACT).  Thus we can skip most everything if just sizing.  We
12937      * call regclass to handle '[]' so as to not have to reinvent its parsing
12938      * rules here (throwing away the size it computes each time).  And, we exit
12939      * upon an unescaped ']' that isn't one ending a regclass.  To do both
12940      * these things, we need to realize that something preceded by a backslash
12941      * is escaped, so we have to keep track of backslashes */
12942     if (PASS2) {
12943         Perl_ck_warner_d(aTHX_
12944             packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
12945             "The regex_sets feature is experimental" REPORT_LOCATION,
12946                 UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
12947                 UTF8fARG(UTF,
12948                          RExC_end - RExC_start - (RExC_parse - RExC_precomp),
12949                          RExC_precomp + (RExC_parse - RExC_precomp)));
12950     }
12951     else {
12952         UV depth = 0; /* how many nested (?[...]) constructs */
12953
12954         while (RExC_parse < RExC_end) {
12955             SV* current = NULL;
12956             RExC_parse = regpatws(pRExC_state, RExC_parse,
12957                                           TRUE); /* means recognize comments */
12958             switch (*RExC_parse) {
12959                 case '?':
12960                     if (RExC_parse[1] == '[') depth++, RExC_parse++;
12961                     /* FALLTHROUGH */
12962                 default:
12963                     break;
12964                 case '\\':
12965                     /* Skip the next byte (which could cause us to end up in
12966                      * the middle of a UTF-8 character, but since none of those
12967                      * are confusable with anything we currently handle in this
12968                      * switch (invariants all), it's safe.  We'll just hit the
12969                      * default: case next time and keep on incrementing until
12970                      * we find one of the invariants we do handle. */
12971                     RExC_parse++;
12972                     break;
12973                 case '[':
12974                 {
12975                     /* If this looks like it is a [:posix:] class, leave the
12976                      * parse pointer at the '[' to fool regclass() into
12977                      * thinking it is part of a '[[:posix:]]'.  That function
12978                      * will use strict checking to force a syntax error if it
12979                      * doesn't work out to a legitimate class */
12980                     bool is_posix_class
12981                                     = could_it_be_a_POSIX_class(pRExC_state);
12982                     if (! is_posix_class) {
12983                         RExC_parse++;
12984                     }
12985
12986                     /* regclass() can only return RESTART_UTF8 if multi-char
12987                        folds are allowed.  */
12988                     if (!regclass(pRExC_state, flagp,depth+1,
12989                                   is_posix_class, /* parse the whole char
12990                                                      class only if not a
12991                                                      posix class */
12992                                   FALSE, /* don't allow multi-char folds */
12993                                   TRUE, /* silence non-portable warnings. */
12994                                   &current))
12995                         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12996                               (UV) *flagp);
12997
12998                     /* function call leaves parse pointing to the ']', except
12999                      * if we faked it */
13000                     if (is_posix_class) {
13001                         RExC_parse--;
13002                     }
13003
13004                     SvREFCNT_dec(current);   /* In case it returned something */
13005                     break;
13006                 }
13007
13008                 case ']':
13009                     if (depth--) break;
13010                     RExC_parse++;
13011                     if (RExC_parse < RExC_end
13012                         && *RExC_parse == ')')
13013                     {
13014                         node = reganode(pRExC_state, ANYOF, 0);
13015                         RExC_size += ANYOF_SKIP;
13016                         nextchar(pRExC_state);
13017                         Set_Node_Length(node,
13018                                 RExC_parse - oregcomp_parse + 1); /* MJD */
13019                         return node;
13020                     }
13021                     goto no_close;
13022             }
13023             RExC_parse++;
13024         }
13025
13026         no_close:
13027         FAIL("Syntax error in (?[...])");
13028     }
13029
13030     /* Pass 2 only after this.  Everything in this construct is a
13031      * metacharacter.  Operands begin with either a '\' (for an escape
13032      * sequence), or a '[' for a bracketed character class.  Any other
13033      * character should be an operator, or parenthesis for grouping.  Both
13034      * types of operands are handled by calling regclass() to parse them.  It
13035      * is called with a parameter to indicate to return the computed inversion
13036      * list.  The parsing here is implemented via a stack.  Each entry on the
13037      * stack is a single character representing one of the operators, or the
13038      * '('; or else a pointer to an operand inversion list. */
13039
13040 #define IS_OPERAND(a)  (! SvIOK(a))
13041
13042     /* The stack starts empty.  It is a syntax error if the first thing parsed
13043      * is a binary operator; everything else is pushed on the stack.  When an
13044      * operand is parsed, the top of the stack is examined.  If it is a binary
13045      * operator, the item before it should be an operand, and both are replaced
13046      * by the result of doing that operation on the new operand and the one on
13047      * the stack.   Thus a sequence of binary operands is reduced to a single
13048      * one before the next one is parsed.
13049      *
13050      * A unary operator may immediately follow a binary in the input, for
13051      * example
13052      *      [a] + ! [b]
13053      * When an operand is parsed and the top of the stack is a unary operator,
13054      * the operation is performed, and then the stack is rechecked to see if
13055      * this new operand is part of a binary operation; if so, it is handled as
13056      * above.
13057      *
13058      * A '(' is simply pushed on the stack; it is valid only if the stack is
13059      * empty, or the top element of the stack is an operator or another '('
13060      * (for which the parenthesized expression will become an operand).  By the
13061      * time the corresponding ')' is parsed everything in between should have
13062      * been parsed and evaluated to a single operand (or else is a syntax
13063      * error), and is handled as a regular operand */
13064
13065     sv_2mortal((SV *)(stack = newAV()));
13066
13067     while (RExC_parse < RExC_end) {
13068         I32 top_index = av_tindex(stack);
13069         SV** top_ptr;
13070         SV* current = NULL;
13071
13072         /* Skip white space */
13073         RExC_parse = regpatws(pRExC_state, RExC_parse,
13074                                          TRUE /* means recognize comments */ );
13075         if (RExC_parse >= RExC_end) {
13076             Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
13077         }
13078         if ((curchar = UCHARAT(RExC_parse)) == ']') {
13079             break;
13080         }
13081
13082         switch (curchar) {
13083
13084             case '?':
13085                 if (av_tindex(stack) >= 0   /* This makes sure that we can
13086                                                safely subtract 1 from
13087                                                RExC_parse in the next clause.
13088                                                If we have something on the
13089                                                stack, we have parsed something
13090                                              */
13091                     && UCHARAT(RExC_parse - 1) == '('
13092                     && RExC_parse < RExC_end)
13093                 {
13094                     /* If is a '(?', could be an embedded '(?flags:(?[...])'.
13095                      * This happens when we have some thing like
13096                      *
13097                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
13098                      *   ...
13099                      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
13100                      *
13101                      * Here we would be handling the interpolated
13102                      * '$thai_or_lao'.  We handle this by a recursive call to
13103                      * ourselves which returns the inversion list the
13104                      * interpolated expression evaluates to.  We use the flags
13105                      * from the interpolated pattern. */
13106                     U32 save_flags = RExC_flags;
13107                     const char * const save_parse = ++RExC_parse;
13108
13109                     parse_lparen_question_flags(pRExC_state);
13110
13111                     if (RExC_parse == save_parse  /* Makes sure there was at
13112                                                      least one flag (or this
13113                                                      embedding wasn't compiled)
13114                                                    */
13115                         || RExC_parse >= RExC_end - 4
13116                         || UCHARAT(RExC_parse) != ':'
13117                         || UCHARAT(++RExC_parse) != '('
13118                         || UCHARAT(++RExC_parse) != '?'
13119                         || UCHARAT(++RExC_parse) != '[')
13120                     {
13121
13122                         /* In combination with the above, this moves the
13123                          * pointer to the point just after the first erroneous
13124                          * character (or if there are no flags, to where they
13125                          * should have been) */
13126                         if (RExC_parse >= RExC_end - 4) {
13127                             RExC_parse = RExC_end;
13128                         }
13129                         else if (RExC_parse != save_parse) {
13130                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13131                         }
13132                         vFAIL("Expecting '(?flags:(?[...'");
13133                     }
13134                     RExC_parse++;
13135                     (void) handle_regex_sets(pRExC_state, &current, flagp,
13136                                                     depth+1, oregcomp_parse);
13137
13138                     /* Here, 'current' contains the embedded expression's
13139                      * inversion list, and RExC_parse points to the trailing
13140                      * ']'; the next character should be the ')' which will be
13141                      * paired with the '(' that has been put on the stack, so
13142                      * the whole embedded expression reduces to '(operand)' */
13143                     RExC_parse++;
13144
13145                     RExC_flags = save_flags;
13146                     goto handle_operand;
13147                 }
13148                 /* FALLTHROUGH */
13149
13150             default:
13151                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13152                 vFAIL("Unexpected character");
13153
13154             case '\\':
13155                 /* regclass() can only return RESTART_UTF8 if multi-char
13156                    folds are allowed.  */
13157                 if (!regclass(pRExC_state, flagp,depth+1,
13158                               TRUE, /* means parse just the next thing */
13159                               FALSE, /* don't allow multi-char folds */
13160                               FALSE, /* don't silence non-portable warnings.  */
13161                               &current))
13162                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
13163                           (UV) *flagp);
13164                 /* regclass() will return with parsing just the \ sequence,
13165                  * leaving the parse pointer at the next thing to parse */
13166                 RExC_parse--;
13167                 goto handle_operand;
13168
13169             case '[':   /* Is a bracketed character class */
13170             {
13171                 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
13172
13173                 if (! is_posix_class) {
13174                     RExC_parse++;
13175                 }
13176
13177                 /* regclass() can only return RESTART_UTF8 if multi-char
13178                    folds are allowed.  */
13179                 if(!regclass(pRExC_state, flagp,depth+1,
13180                              is_posix_class, /* parse the whole char class
13181                                                 only if not a posix class */
13182                              FALSE, /* don't allow multi-char folds */
13183                              FALSE, /* don't silence non-portable warnings.  */
13184                              &current))
13185                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
13186                           (UV) *flagp);
13187                 /* function call leaves parse pointing to the ']', except if we
13188                  * faked it */
13189                 if (is_posix_class) {
13190                     RExC_parse--;
13191                 }
13192
13193                 goto handle_operand;
13194             }
13195
13196             case '&':
13197             case '|':
13198             case '+':
13199             case '-':
13200             case '^':
13201                 if (top_index < 0
13202                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
13203                     || ! IS_OPERAND(*top_ptr))
13204                 {
13205                     RExC_parse++;
13206                     vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
13207                 }
13208                 av_push(stack, newSVuv(curchar));
13209                 break;
13210
13211             case '!':
13212                 av_push(stack, newSVuv(curchar));
13213                 break;
13214
13215             case '(':
13216                 if (top_index >= 0) {
13217                     top_ptr = av_fetch(stack, top_index, FALSE);
13218                     assert(top_ptr);
13219                     if (IS_OPERAND(*top_ptr)) {
13220                         RExC_parse++;
13221                         vFAIL("Unexpected '(' with no preceding operator");
13222                     }
13223                 }
13224                 av_push(stack, newSVuv(curchar));
13225                 break;
13226
13227             case ')':
13228             {
13229                 SV* lparen;
13230                 if (top_index < 1
13231                     || ! (current = av_pop(stack))
13232                     || ! IS_OPERAND(current)
13233                     || ! (lparen = av_pop(stack))
13234                     || IS_OPERAND(lparen)
13235                     || SvUV(lparen) != '(')
13236                 {
13237                     SvREFCNT_dec(current);
13238                     RExC_parse++;
13239                     vFAIL("Unexpected ')'");
13240                 }
13241                 top_index -= 2;
13242                 SvREFCNT_dec_NN(lparen);
13243
13244                 /* FALLTHROUGH */
13245             }
13246
13247               handle_operand:
13248
13249                 /* Here, we have an operand to process, in 'current' */
13250
13251                 if (top_index < 0) {    /* Just push if stack is empty */
13252                     av_push(stack, current);
13253                 }
13254                 else {
13255                     SV* top = av_pop(stack);
13256                     SV *prev = NULL;
13257                     char current_operator;
13258
13259                     if (IS_OPERAND(top)) {
13260                         SvREFCNT_dec_NN(top);
13261                         SvREFCNT_dec_NN(current);
13262                         vFAIL("Operand with no preceding operator");
13263                     }
13264                     current_operator = (char) SvUV(top);
13265                     switch (current_operator) {
13266                         case '(':   /* Push the '(' back on followed by the new
13267                                        operand */
13268                             av_push(stack, top);
13269                             av_push(stack, current);
13270                             SvREFCNT_inc(top);  /* Counters the '_dec' done
13271                                                    just after the 'break', so
13272                                                    it doesn't get wrongly freed
13273                                                  */
13274                             break;
13275
13276                         case '!':
13277                             _invlist_invert(current);
13278
13279                             /* Unlike binary operators, the top of the stack,
13280                              * now that this unary one has been popped off, may
13281                              * legally be an operator, and we now have operand
13282                              * for it. */
13283                             top_index--;
13284                             SvREFCNT_dec_NN(top);
13285                             goto handle_operand;
13286
13287                         case '&':
13288                             prev = av_pop(stack);
13289                             _invlist_intersection(prev,
13290                                                    current,
13291                                                    &current);
13292                             av_push(stack, current);
13293                             break;
13294
13295                         case '|':
13296                         case '+':
13297                             prev = av_pop(stack);
13298                             _invlist_union(prev, current, &current);
13299                             av_push(stack, current);
13300                             break;
13301
13302                         case '-':
13303                             prev = av_pop(stack);;
13304                             _invlist_subtract(prev, current, &current);
13305                             av_push(stack, current);
13306                             break;
13307
13308                         case '^':   /* The union minus the intersection */
13309                         {
13310                             SV* i = NULL;
13311                             SV* u = NULL;
13312                             SV* element;
13313
13314                             prev = av_pop(stack);
13315                             _invlist_union(prev, current, &u);
13316                             _invlist_intersection(prev, current, &i);
13317                             /* _invlist_subtract will overwrite current
13318                                 without freeing what it already contains */
13319                             element = current;
13320                             _invlist_subtract(u, i, &current);
13321                             av_push(stack, current);
13322                             SvREFCNT_dec_NN(i);
13323                             SvREFCNT_dec_NN(u);
13324                             SvREFCNT_dec_NN(element);
13325                             break;
13326                         }
13327
13328                         default:
13329                             Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
13330                 }
13331                 SvREFCNT_dec_NN(top);
13332                 SvREFCNT_dec(prev);
13333             }
13334         }
13335
13336         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13337     }
13338
13339     if (av_tindex(stack) < 0   /* Was empty */
13340         || ((final = av_pop(stack)) == NULL)
13341         || ! IS_OPERAND(final)
13342         || av_tindex(stack) >= 0)  /* More left on stack */
13343     {
13344         vFAIL("Incomplete expression within '(?[ ])'");
13345     }
13346
13347     /* Here, 'final' is the resultant inversion list from evaluating the
13348      * expression.  Return it if so requested */
13349     if (return_invlist) {
13350         *return_invlist = final;
13351         return END;
13352     }
13353
13354     /* Otherwise generate a resultant node, based on 'final'.  regclass() is
13355      * expecting a string of ranges and individual code points */
13356     invlist_iterinit(final);
13357     result_string = newSVpvs("");
13358     while (invlist_iternext(final, &start, &end)) {
13359         if (start == end) {
13360             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
13361         }
13362         else {
13363             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
13364                                                      start,          end);
13365         }
13366     }
13367
13368     save_parse = RExC_parse;
13369     RExC_parse = SvPV(result_string, len);
13370     save_end = RExC_end;
13371     RExC_end = RExC_parse + len;
13372
13373     /* We turn off folding around the call, as the class we have constructed
13374      * already has all folding taken into consideration, and we don't want
13375      * regclass() to add to that */
13376     RExC_flags &= ~RXf_PMf_FOLD;
13377     /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
13378      */
13379     node = regclass(pRExC_state, flagp,depth+1,
13380                     FALSE, /* means parse the whole char class */
13381                     FALSE, /* don't allow multi-char folds */
13382                     TRUE, /* silence non-portable warnings.  The above may very
13383                              well have generated non-portable code points, but
13384                              they're valid on this machine */
13385                     NULL);
13386     if (!node)
13387         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
13388                     PTR2UV(flagp));
13389     if (save_fold) {
13390         RExC_flags |= RXf_PMf_FOLD;
13391     }
13392     RExC_parse = save_parse + 1;
13393     RExC_end = save_end;
13394     SvREFCNT_dec_NN(final);
13395     SvREFCNT_dec_NN(result_string);
13396
13397     nextchar(pRExC_state);
13398     Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
13399     return node;
13400 }
13401 #undef IS_OPERAND
13402
13403 STATIC void
13404 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
13405 {
13406     /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
13407      * innocent-looking character class, like /[ks]/i won't have to go out to
13408      * disk to find the possible matches.
13409      *
13410      * This should be called only for a Latin1-range code points, cp, which is
13411      * known to be involved in a simple fold with other code points above
13412      * Latin1.  It would give false results if /aa has been specified.
13413      * Multi-char folds are outside the scope of this, and must be handled
13414      * specially.
13415      *
13416      * XXX It would be better to generate these via regen, in case a new
13417      * version of the Unicode standard adds new mappings, though that is not
13418      * really likely, and may be caught by the default: case of the switch
13419      * below. */
13420
13421     PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
13422
13423     assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
13424
13425     switch (cp) {
13426         case 'k':
13427         case 'K':
13428           *invlist =
13429              add_cp_to_invlist(*invlist, KELVIN_SIGN);
13430             break;
13431         case 's':
13432         case 'S':
13433           *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
13434             break;
13435         case MICRO_SIGN:
13436           *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
13437           *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
13438             break;
13439         case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13440         case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13441           *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
13442             break;
13443         case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13444           *invlist = add_cp_to_invlist(*invlist,
13445                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13446             break;
13447         case LATIN_SMALL_LETTER_SHARP_S:
13448           *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
13449             break;
13450         default:
13451             /* Use deprecated warning to increase the chances of this being
13452              * output */
13453             if (PASS2) {
13454                 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
13455             }
13456             break;
13457     }
13458 }
13459
13460 STATIC AV *
13461 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
13462 {
13463     /* This adds the string scalar <multi_string> to the array
13464      * <multi_char_matches>.  <multi_string> is known to have exactly
13465      * <cp_count> code points in it.  This is used when constructing a
13466      * bracketed character class and we find something that needs to match more
13467      * than a single character.
13468      *
13469      * <multi_char_matches> is actually an array of arrays.  Each top-level
13470      * element is an array that contains all the strings known so far that are
13471      * the same length.  And that length (in number of code points) is the same
13472      * as the index of the top-level array.  Hence, the [2] element is an
13473      * array, each element thereof is a string containing TWO code points;
13474      * while element [3] is for strings of THREE characters, and so on.  Since
13475      * this is for multi-char strings there can never be a [0] nor [1] element.
13476      *
13477      * When we rewrite the character class below, we will do so such that the
13478      * longest strings are written first, so that it prefers the longest
13479      * matching strings first.  This is done even if it turns out that any
13480      * quantifier is non-greedy, out of this programmer's (khw) laziness.  Tom
13481      * Christiansen has agreed that this is ok.  This makes the test for the
13482      * ligature 'ffi' come before the test for 'ff', for example */
13483
13484     AV* this_array;
13485     AV** this_array_ptr;
13486
13487     PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
13488
13489     if (! multi_char_matches) {
13490         multi_char_matches = newAV();
13491     }
13492
13493     if (av_exists(multi_char_matches, cp_count)) {
13494         this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
13495         this_array = *this_array_ptr;
13496     }
13497     else {
13498         this_array = newAV();
13499         av_store(multi_char_matches, cp_count,
13500                  (SV*) this_array);
13501     }
13502     av_push(this_array, multi_string);
13503
13504     return multi_char_matches;
13505 }
13506
13507 /* The names of properties whose definitions are not known at compile time are
13508  * stored in this SV, after a constant heading.  So if the length has been
13509  * changed since initialization, then there is a run-time definition. */
13510 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
13511                                         (SvCUR(listsv) != initial_listsv_len)
13512
13513 STATIC regnode *
13514 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
13515                  const bool stop_at_1,  /* Just parse the next thing, don't
13516                                            look for a full character class */
13517                  bool allow_multi_folds,
13518                  const bool silence_non_portable,   /* Don't output warnings
13519                                                        about too large
13520                                                        characters */
13521                  SV** ret_invlist)  /* Return an inversion list, not a node */
13522 {
13523     /* parse a bracketed class specification.  Most of these will produce an
13524      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
13525      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
13526      * under /i with multi-character folds: it will be rewritten following the
13527      * paradigm of this example, where the <multi-fold>s are characters which
13528      * fold to multiple character sequences:
13529      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
13530      * gets effectively rewritten as:
13531      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
13532      * reg() gets called (recursively) on the rewritten version, and this
13533      * function will return what it constructs.  (Actually the <multi-fold>s
13534      * aren't physically removed from the [abcdefghi], it's just that they are
13535      * ignored in the recursion by means of a flag:
13536      * <RExC_in_multi_char_class>.)
13537      *
13538      * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
13539      * characters, with the corresponding bit set if that character is in the
13540      * list.  For characters above this, a range list or swash is used.  There
13541      * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
13542      * determinable at compile time
13543      *
13544      * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
13545      * to be restarted.  This can only happen if ret_invlist is non-NULL.
13546      */
13547
13548     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
13549     IV range = 0;
13550     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
13551     regnode *ret;
13552     STRLEN numlen;
13553     IV namedclass = OOB_NAMEDCLASS;
13554     char *rangebegin = NULL;
13555     bool need_class = 0;
13556     SV *listsv = NULL;
13557     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
13558                                       than just initialized.  */
13559     SV* properties = NULL;    /* Code points that match \p{} \P{} */
13560     SV* posixes = NULL;     /* Code points that match classes like [:word:],
13561                                extended beyond the Latin1 range.  These have to
13562                                be kept separate from other code points for much
13563                                of this function because their handling  is
13564                                different under /i, and for most classes under
13565                                /d as well */
13566     SV* nposixes = NULL;    /* Similarly for [:^word:].  These are kept
13567                                separate for a while from the non-complemented
13568                                versions because of complications with /d
13569                                matching */
13570     UV element_count = 0;   /* Number of distinct elements in the class.
13571                                Optimizations may be possible if this is tiny */
13572     AV * multi_char_matches = NULL; /* Code points that fold to more than one
13573                                        character; used under /i */
13574     UV n;
13575     char * stop_ptr = RExC_end;    /* where to stop parsing */
13576     const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
13577                                                    space? */
13578     const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
13579
13580     /* Unicode properties are stored in a swash; this holds the current one
13581      * being parsed.  If this swash is the only above-latin1 component of the
13582      * character class, an optimization is to pass it directly on to the
13583      * execution engine.  Otherwise, it is set to NULL to indicate that there
13584      * are other things in the class that have to be dealt with at execution
13585      * time */
13586     SV* swash = NULL;           /* Code points that match \p{} \P{} */
13587
13588     /* Set if a component of this character class is user-defined; just passed
13589      * on to the engine */
13590     bool has_user_defined_property = FALSE;
13591
13592     /* inversion list of code points this node matches only when the target
13593      * string is in UTF-8.  (Because is under /d) */
13594     SV* depends_list = NULL;
13595
13596     /* Inversion list of code points this node matches regardless of things
13597      * like locale, folding, utf8ness of the target string */
13598     SV* cp_list = NULL;
13599
13600     /* Like cp_list, but code points on this list need to be checked for things
13601      * that fold to/from them under /i */
13602     SV* cp_foldable_list = NULL;
13603
13604     /* Like cp_list, but code points on this list are valid only when the
13605      * runtime locale is UTF-8 */
13606     SV* only_utf8_locale_list = NULL;
13607
13608 #ifdef EBCDIC
13609     /* In a range, counts how many 0-2 of the ends of it came from literals,
13610      * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
13611     UV literal_endpoint = 0;
13612 #endif
13613     bool invert = FALSE;    /* Is this class to be complemented */
13614
13615     bool warn_super = ALWAYS_WARN_SUPER;
13616
13617     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
13618         case we need to change the emitted regop to an EXACT. */
13619     const char * orig_parse = RExC_parse;
13620     const SSize_t orig_size = RExC_size;
13621     bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
13622     GET_RE_DEBUG_FLAGS_DECL;
13623
13624     PERL_ARGS_ASSERT_REGCLASS;
13625 #ifndef DEBUGGING
13626     PERL_UNUSED_ARG(depth);
13627 #endif
13628
13629     DEBUG_PARSE("clas");
13630
13631     /* Assume we are going to generate an ANYOF node. */
13632     ret = reganode(pRExC_state, ANYOF, 0);
13633
13634     if (SIZE_ONLY) {
13635         RExC_size += ANYOF_SKIP;
13636         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
13637     }
13638     else {
13639         ANYOF_FLAGS(ret) = 0;
13640
13641         RExC_emit += ANYOF_SKIP;
13642         listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
13643         initial_listsv_len = SvCUR(listsv);
13644         SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
13645     }
13646
13647     if (skip_white) {
13648         RExC_parse = regpatws(pRExC_state, RExC_parse,
13649                               FALSE /* means don't recognize comments */ );
13650     }
13651
13652     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
13653         RExC_parse++;
13654         invert = TRUE;
13655         allow_multi_folds = FALSE;
13656         RExC_naughty++;
13657         if (skip_white) {
13658             RExC_parse = regpatws(pRExC_state, RExC_parse,
13659                                   FALSE /* means don't recognize comments */ );
13660         }
13661     }
13662
13663     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
13664     if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
13665         const char *s = RExC_parse;
13666         const char  c = *s++;
13667
13668         while (isWORDCHAR(*s))
13669             s++;
13670         if (*s && c == *s && s[1] == ']') {
13671             SAVEFREESV(RExC_rx_sv);
13672             ckWARN3reg(s+2,
13673                        "POSIX syntax [%c %c] belongs inside character classes",
13674                        c, c);
13675             (void)ReREFCNT_inc(RExC_rx_sv);
13676         }
13677     }
13678
13679     /* If the caller wants us to just parse a single element, accomplish this
13680      * by faking the loop ending condition */
13681     if (stop_at_1 && RExC_end > RExC_parse) {
13682         stop_ptr = RExC_parse + 1;
13683     }
13684
13685     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
13686     if (UCHARAT(RExC_parse) == ']')
13687         goto charclassloop;
13688
13689     while (1) {
13690         if  (RExC_parse >= stop_ptr) {
13691             break;
13692         }
13693
13694         if (skip_white) {
13695             RExC_parse = regpatws(pRExC_state, RExC_parse,
13696                                   FALSE /* means don't recognize comments */ );
13697         }
13698
13699         if  (UCHARAT(RExC_parse) == ']') {
13700             break;
13701         }
13702
13703     charclassloop:
13704
13705         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
13706         save_value = value;
13707         save_prevvalue = prevvalue;
13708
13709         if (!range) {
13710             rangebegin = RExC_parse;
13711             element_count++;
13712         }
13713         if (UTF) {
13714             value = utf8n_to_uvchr((U8*)RExC_parse,
13715                                    RExC_end - RExC_parse,
13716                                    &numlen, UTF8_ALLOW_DEFAULT);
13717             RExC_parse += numlen;
13718         }
13719         else
13720             value = UCHARAT(RExC_parse++);
13721
13722         if (value == '['
13723             && RExC_parse < RExC_end
13724             && POSIXCC(UCHARAT(RExC_parse)))
13725         {
13726             namedclass = regpposixcc(pRExC_state, value, strict);
13727         }
13728         else if (value != '\\') {
13729 #ifdef EBCDIC
13730             literal_endpoint++;
13731 #endif
13732         }
13733         else {
13734             /* Is a backslash; get the code point of the char after it */
13735             if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
13736                 value = utf8n_to_uvchr((U8*)RExC_parse,
13737                                    RExC_end - RExC_parse,
13738                                    &numlen, UTF8_ALLOW_DEFAULT);
13739                 RExC_parse += numlen;
13740             }
13741             else
13742                 value = UCHARAT(RExC_parse++);
13743
13744             /* Some compilers cannot handle switching on 64-bit integer
13745              * values, therefore value cannot be an UV.  Yes, this will
13746              * be a problem later if we want switch on Unicode.
13747              * A similar issue a little bit later when switching on
13748              * namedclass. --jhi */
13749
13750             /* If the \ is escaping white space when white space is being
13751              * skipped, it means that that white space is wanted literally, and
13752              * is already in 'value'.  Otherwise, need to translate the escape
13753              * into what it signifies. */
13754             if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
13755
13756             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
13757             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
13758             case 's':   namedclass = ANYOF_SPACE;       break;
13759             case 'S':   namedclass = ANYOF_NSPACE;      break;
13760             case 'd':   namedclass = ANYOF_DIGIT;       break;
13761             case 'D':   namedclass = ANYOF_NDIGIT;      break;
13762             case 'v':   namedclass = ANYOF_VERTWS;      break;
13763             case 'V':   namedclass = ANYOF_NVERTWS;     break;
13764             case 'h':   namedclass = ANYOF_HORIZWS;     break;
13765             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
13766             case 'N':  /* Handle \N{NAME} in class */
13767                 {
13768                     SV *as_text;
13769                     STRLEN cp_count = grok_bslash_N(pRExC_state, NULL, &value,
13770                                                     flagp, depth, &as_text);
13771                     if (*flagp & RESTART_UTF8)
13772                         FAIL("panic: grok_bslash_N set RESTART_UTF8");
13773                     if (cp_count != 1) {    /* The typical case drops through */
13774                         assert(cp_count != (STRLEN) -1);
13775                         if (cp_count == 0) {
13776                             if (strict) {
13777                                 RExC_parse++;   /* Position after the "}" */
13778                                 vFAIL("Zero length \\N{}");
13779                             }
13780                             else if (PASS2) {
13781                                 ckWARNreg(RExC_parse,
13782                                         "Ignoring zero length \\N{} in character class");
13783                             }
13784                         }
13785                         else { /* cp_count > 1 */
13786                             if (! RExC_in_multi_char_class) {
13787                                 if (invert || range || *RExC_parse == '-') {
13788                                     if (strict) {
13789                                         RExC_parse--;
13790                                         vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
13791                                     }
13792                                     else if (PASS2) {
13793                                         ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
13794                                     }
13795                                 }
13796                                 else {
13797                                     multi_char_matches
13798                                         = add_multi_match(multi_char_matches,
13799                                                           as_text,
13800                                                           cp_count);
13801                                 }
13802                                 break; /* <value> contains the first code
13803                                           point. Drop out of the switch to
13804                                           process it */
13805                             }
13806                         } /* End of cp_count != 1 */
13807
13808                         /* This element should not be processed further in this
13809                          * class */
13810                         element_count--;
13811                         value = save_value;
13812                         prevvalue = save_prevvalue;
13813                         continue;   /* Back to top of loop to get next char */
13814                     }
13815                     /* Here, is a single code point, and <value> contains it */
13816 #ifdef EBCDIC
13817                     /* We consider named characters to be literal characters */
13818                     literal_endpoint++;
13819 #endif
13820                 }
13821                 break;
13822             case 'p':
13823             case 'P':
13824                 {
13825                 char *e;
13826
13827                 /* We will handle any undefined properties ourselves */
13828                 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
13829                                        /* And we actually would prefer to get
13830                                         * the straight inversion list of the
13831                                         * swash, since we will be accessing it
13832                                         * anyway, to save a little time */
13833                                       |_CORE_SWASH_INIT_ACCEPT_INVLIST;
13834
13835                 if (RExC_parse >= RExC_end)
13836                     vFAIL2("Empty \\%c{}", (U8)value);
13837                 if (*RExC_parse == '{') {
13838                     const U8 c = (U8)value;
13839                     e = strchr(RExC_parse++, '}');
13840                     if (!e)
13841                         vFAIL2("Missing right brace on \\%c{}", c);
13842                     while (isSPACE(*RExC_parse))
13843                         RExC_parse++;
13844                     if (e == RExC_parse)
13845                         vFAIL2("Empty \\%c{}", c);
13846                     n = e - RExC_parse;
13847                     while (isSPACE(*(RExC_parse + n - 1)))
13848                         n--;
13849                 }
13850                 else {
13851                     e = RExC_parse;
13852                     n = 1;
13853                 }
13854                 if (!SIZE_ONLY) {
13855                     SV* invlist;
13856                     char* name;
13857
13858                     if (UCHARAT(RExC_parse) == '^') {
13859                          RExC_parse++;
13860                          n--;
13861                          /* toggle.  (The rhs xor gets the single bit that
13862                           * differs between P and p; the other xor inverts just
13863                           * that bit) */
13864                          value ^= 'P' ^ 'p';
13865
13866                          while (isSPACE(*RExC_parse)) {
13867                               RExC_parse++;
13868                               n--;
13869                          }
13870                     }
13871                     /* Try to get the definition of the property into
13872                      * <invlist>.  If /i is in effect, the effective property
13873                      * will have its name be <__NAME_i>.  The design is
13874                      * discussed in commit
13875                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
13876                     name = savepv(Perl_form(aTHX_
13877                                           "%s%.*s%s\n",
13878                                           (FOLD) ? "__" : "",
13879                                           (int)n,
13880                                           RExC_parse,
13881                                           (FOLD) ? "_i" : ""
13882                                 ));
13883
13884                     /* Look up the property name, and get its swash and
13885                      * inversion list, if the property is found  */
13886                     if (swash) {
13887                         SvREFCNT_dec_NN(swash);
13888                     }
13889                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
13890                                              1, /* binary */
13891                                              0, /* not tr/// */
13892                                              NULL, /* No inversion list */
13893                                              &swash_init_flags
13894                                             );
13895                     if (! swash || ! (invlist = _get_swash_invlist(swash))) {
13896                         HV* curpkg = (IN_PERL_COMPILETIME)
13897                                       ? PL_curstash
13898                                       : CopSTASH(PL_curcop);
13899                         if (swash) {
13900                             SvREFCNT_dec_NN(swash);
13901                             swash = NULL;
13902                         }
13903
13904                         /* Here didn't find it.  It could be a user-defined
13905                          * property that will be available at run-time.  If we
13906                          * accept only compile-time properties, is an error;
13907                          * otherwise add it to the list for run-time look up */
13908                         if (ret_invlist) {
13909                             RExC_parse = e + 1;
13910                             vFAIL2utf8f(
13911                                 "Property '%"UTF8f"' is unknown",
13912                                 UTF8fARG(UTF, n, name));
13913                         }
13914
13915                         /* If the property name doesn't already have a package
13916                          * name, add the current one to it so that it can be
13917                          * referred to outside it. [perl #121777] */
13918                         if (curpkg && ! instr(name, "::")) {
13919                             char* pkgname = HvNAME(curpkg);
13920                             if (strNE(pkgname, "main")) {
13921                                 char* full_name = Perl_form(aTHX_
13922                                                             "%s::%s",
13923                                                             pkgname,
13924                                                             name);
13925                                 n = strlen(full_name);
13926                                 Safefree(name);
13927                                 name = savepvn(full_name, n);
13928                             }
13929                         }
13930                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
13931                                         (value == 'p' ? '+' : '!'),
13932                                         UTF8fARG(UTF, n, name));
13933                         has_user_defined_property = TRUE;
13934
13935                         /* We don't know yet, so have to assume that the
13936                          * property could match something in the Latin1 range,
13937                          * hence something that isn't utf8.  Note that this
13938                          * would cause things in <depends_list> to match
13939                          * inappropriately, except that any \p{}, including
13940                          * this one forces Unicode semantics, which means there
13941                          * is no <depends_list> */
13942                         ANYOF_FLAGS(ret)
13943                                       |= ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES;
13944                     }
13945                     else {
13946
13947                         /* Here, did get the swash and its inversion list.  If
13948                          * the swash is from a user-defined property, then this
13949                          * whole character class should be regarded as such */
13950                         if (swash_init_flags
13951                             & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
13952                         {
13953                             has_user_defined_property = TRUE;
13954                         }
13955                         else if
13956                             /* We warn on matching an above-Unicode code point
13957                              * if the match would return true, except don't
13958                              * warn for \p{All}, which has exactly one element
13959                              * = 0 */
13960                             (_invlist_contains_cp(invlist, 0x110000)
13961                                 && (! (_invlist_len(invlist) == 1
13962                                        && *invlist_array(invlist) == 0)))
13963                         {
13964                             warn_super = TRUE;
13965                         }
13966
13967
13968                         /* Invert if asking for the complement */
13969                         if (value == 'P') {
13970                             _invlist_union_complement_2nd(properties,
13971                                                           invlist,
13972                                                           &properties);
13973
13974                             /* The swash can't be used as-is, because we've
13975                              * inverted things; delay removing it to here after
13976                              * have copied its invlist above */
13977                             SvREFCNT_dec_NN(swash);
13978                             swash = NULL;
13979                         }
13980                         else {
13981                             _invlist_union(properties, invlist, &properties);
13982                         }
13983                     }
13984                     Safefree(name);
13985                 }
13986                 RExC_parse = e + 1;
13987                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
13988                                                 named */
13989
13990                 /* \p means they want Unicode semantics */
13991                 RExC_uni_semantics = 1;
13992                 }
13993                 break;
13994             case 'n':   value = '\n';                   break;
13995             case 'r':   value = '\r';                   break;
13996             case 't':   value = '\t';                   break;
13997             case 'f':   value = '\f';                   break;
13998             case 'b':   value = '\b';                   break;
13999             case 'e':   value = ESC_NATIVE;             break;
14000             case 'a':   value = '\a';                   break;
14001             case 'o':
14002                 RExC_parse--;   /* function expects to be pointed at the 'o' */
14003                 {
14004                     const char* error_msg;
14005                     bool valid = grok_bslash_o(&RExC_parse,
14006                                                &value,
14007                                                &error_msg,
14008                                                PASS2,   /* warnings only in
14009                                                            pass 2 */
14010                                                strict,
14011                                                silence_non_portable,
14012                                                UTF);
14013                     if (! valid) {
14014                         vFAIL(error_msg);
14015                     }
14016                 }
14017                 if (PL_encoding && value < 0x100) {
14018                     goto recode_encoding;
14019                 }
14020                 break;
14021             case 'x':
14022                 RExC_parse--;   /* function expects to be pointed at the 'x' */
14023                 {
14024                     const char* error_msg;
14025                     bool valid = grok_bslash_x(&RExC_parse,
14026                                                &value,
14027                                                &error_msg,
14028                                                PASS2, /* Output warnings */
14029                                                strict,
14030                                                silence_non_portable,
14031                                                UTF);
14032                     if (! valid) {
14033                         vFAIL(error_msg);
14034                     }
14035                 }
14036                 if (PL_encoding && value < 0x100)
14037                     goto recode_encoding;
14038                 break;
14039             case 'c':
14040                 value = grok_bslash_c(*RExC_parse++, PASS2);
14041                 break;
14042             case '0': case '1': case '2': case '3': case '4':
14043             case '5': case '6': case '7':
14044                 {
14045                     /* Take 1-3 octal digits */
14046                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
14047                     numlen = (strict) ? 4 : 3;
14048                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
14049                     RExC_parse += numlen;
14050                     if (numlen != 3) {
14051                         if (strict) {
14052                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
14053                             vFAIL("Need exactly 3 octal digits");
14054                         }
14055                         else if (! SIZE_ONLY /* like \08, \178 */
14056                                  && numlen < 3
14057                                  && RExC_parse < RExC_end
14058                                  && isDIGIT(*RExC_parse)
14059                                  && ckWARN(WARN_REGEXP))
14060                         {
14061                             SAVEFREESV(RExC_rx_sv);
14062                             reg_warn_non_literal_string(
14063                                  RExC_parse + 1,
14064                                  form_short_octal_warning(RExC_parse, numlen));
14065                             (void)ReREFCNT_inc(RExC_rx_sv);
14066                         }
14067                     }
14068                     if (PL_encoding && value < 0x100)
14069                         goto recode_encoding;
14070                     break;
14071                 }
14072             recode_encoding:
14073                 if (! RExC_override_recoding) {
14074                     SV* enc = PL_encoding;
14075                     value = reg_recode((const char)(U8)value, &enc);
14076                     if (!enc) {
14077                         if (strict) {
14078                             vFAIL("Invalid escape in the specified encoding");
14079                         }
14080                         else if (PASS2) {
14081                             ckWARNreg(RExC_parse,
14082                                   "Invalid escape in the specified encoding");
14083                         }
14084                     }
14085                     break;
14086                 }
14087             default:
14088                 /* Allow \_ to not give an error */
14089                 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
14090                     if (strict) {
14091                         vFAIL2("Unrecognized escape \\%c in character class",
14092                                (int)value);
14093                     }
14094                     else {
14095                         SAVEFREESV(RExC_rx_sv);
14096                         ckWARN2reg(RExC_parse,
14097                             "Unrecognized escape \\%c in character class passed through",
14098                             (int)value);
14099                         (void)ReREFCNT_inc(RExC_rx_sv);
14100                     }
14101                 }
14102                 break;
14103             }   /* End of switch on char following backslash */
14104         } /* end of handling backslash escape sequences */
14105
14106         /* Here, we have the current token in 'value' */
14107
14108         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
14109             U8 classnum;
14110
14111             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
14112              * literal, as is the character that began the false range, i.e.
14113              * the 'a' in the examples */
14114             if (range) {
14115                 if (!SIZE_ONLY) {
14116                     const int w = (RExC_parse >= rangebegin)
14117                                   ? RExC_parse - rangebegin
14118                                   : 0;
14119                     if (strict) {
14120                         vFAIL2utf8f(
14121                             "False [] range \"%"UTF8f"\"",
14122                             UTF8fARG(UTF, w, rangebegin));
14123                     }
14124                     else {
14125                         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
14126                         ckWARN2reg(RExC_parse,
14127                             "False [] range \"%"UTF8f"\"",
14128                             UTF8fARG(UTF, w, rangebegin));
14129                         (void)ReREFCNT_inc(RExC_rx_sv);
14130                         cp_list = add_cp_to_invlist(cp_list, '-');
14131                         cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
14132                                                              prevvalue);
14133                     }
14134                 }
14135
14136                 range = 0; /* this was not a true range */
14137                 element_count += 2; /* So counts for three values */
14138             }
14139
14140             classnum = namedclass_to_classnum(namedclass);
14141
14142             if (LOC && namedclass < ANYOF_POSIXL_MAX
14143 #ifndef HAS_ISASCII
14144                 && classnum != _CC_ASCII
14145 #endif
14146             ) {
14147                 /* What the Posix classes (like \w, [:space:]) match in locale
14148                  * isn't knowable under locale until actual match time.  Room
14149                  * must be reserved (one time per outer bracketed class) to
14150                  * store such classes.  The space will contain a bit for each
14151                  * named class that is to be matched against.  This isn't
14152                  * needed for \p{} and pseudo-classes, as they are not affected
14153                  * by locale, and hence are dealt with separately */
14154                 if (! need_class) {
14155                     need_class = 1;
14156                     if (SIZE_ONLY) {
14157                         RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
14158                     }
14159                     else {
14160                         RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
14161                     }
14162                     ANYOF_FLAGS(ret) |= ANYOF_MATCHES_POSIXL;
14163                     ANYOF_POSIXL_ZERO(ret);
14164                 }
14165
14166                 /* Coverity thinks it is possible for this to be negative; both
14167                  * jhi and khw think it's not, but be safer */
14168                 assert(! (ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
14169                        || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
14170
14171                 /* See if it already matches the complement of this POSIX
14172                  * class */
14173                 if ((ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
14174                     && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
14175                                                             ? -1
14176                                                             : 1)))
14177                 {
14178                     posixl_matches_all = TRUE;
14179                     break;  /* No need to continue.  Since it matches both
14180                                e.g., \w and \W, it matches everything, and the
14181                                bracketed class can be optimized into qr/./s */
14182                 }
14183
14184                 /* Add this class to those that should be checked at runtime */
14185                 ANYOF_POSIXL_SET(ret, namedclass);
14186
14187                 /* The above-Latin1 characters are not subject to locale rules.
14188                  * Just add them, in the second pass, to the
14189                  * unconditionally-matched list */
14190                 if (! SIZE_ONLY) {
14191                     SV* scratch_list = NULL;
14192
14193                     /* Get the list of the above-Latin1 code points this
14194                      * matches */
14195                     _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
14196                                           PL_XPosix_ptrs[classnum],
14197
14198                                           /* Odd numbers are complements, like
14199                                            * NDIGIT, NASCII, ... */
14200                                           namedclass % 2 != 0,
14201                                           &scratch_list);
14202                     /* Checking if 'cp_list' is NULL first saves an extra
14203                      * clone.  Its reference count will be decremented at the
14204                      * next union, etc, or if this is the only instance, at the
14205                      * end of the routine */
14206                     if (! cp_list) {
14207                         cp_list = scratch_list;
14208                     }
14209                     else {
14210                         _invlist_union(cp_list, scratch_list, &cp_list);
14211                         SvREFCNT_dec_NN(scratch_list);
14212                     }
14213                     continue;   /* Go get next character */
14214                 }
14215             }
14216             else if (! SIZE_ONLY) {
14217
14218                 /* Here, not in pass1 (in that pass we skip calculating the
14219                  * contents of this class), and is /l, or is a POSIX class for
14220                  * which /l doesn't matter (or is a Unicode property, which is
14221                  * skipped here). */
14222                 if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
14223                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
14224
14225                         /* Here, should be \h, \H, \v, or \V.  None of /d, /i
14226                          * nor /l make a difference in what these match,
14227                          * therefore we just add what they match to cp_list. */
14228                         if (classnum != _CC_VERTSPACE) {
14229                             assert(   namedclass == ANYOF_HORIZWS
14230                                    || namedclass == ANYOF_NHORIZWS);
14231
14232                             /* It turns out that \h is just a synonym for
14233                              * XPosixBlank */
14234                             classnum = _CC_BLANK;
14235                         }
14236
14237                         _invlist_union_maybe_complement_2nd(
14238                                 cp_list,
14239                                 PL_XPosix_ptrs[classnum],
14240                                 namedclass % 2 != 0,    /* Complement if odd
14241                                                           (NHORIZWS, NVERTWS)
14242                                                         */
14243                                 &cp_list);
14244                     }
14245                 }
14246                 else {  /* Garden variety class.  If is NASCII, NDIGIT, ...
14247                            complement and use nposixes */
14248                     SV** posixes_ptr = namedclass % 2 == 0
14249                                        ? &posixes
14250                                        : &nposixes;
14251                     SV** source_ptr = &PL_XPosix_ptrs[classnum];
14252                     _invlist_union_maybe_complement_2nd(
14253                                                      *posixes_ptr,
14254                                                      *source_ptr,
14255                                                      namedclass % 2 != 0,
14256                                                      posixes_ptr);
14257                 }
14258             }
14259         } /* end of namedclass \blah */
14260
14261         if (skip_white) {
14262             RExC_parse = regpatws(pRExC_state, RExC_parse,
14263                                 FALSE /* means don't recognize comments */ );
14264         }
14265
14266         /* If 'range' is set, 'value' is the ending of a range--check its
14267          * validity.  (If value isn't a single code point in the case of a
14268          * range, we should have figured that out above in the code that
14269          * catches false ranges).  Later, we will handle each individual code
14270          * point in the range.  If 'range' isn't set, this could be the
14271          * beginning of a range, so check for that by looking ahead to see if
14272          * the next real character to be processed is the range indicator--the
14273          * minus sign */
14274
14275         if (range) {
14276             if (prevvalue > value) /* b-a */ {
14277                 const int w = RExC_parse - rangebegin;
14278                 vFAIL2utf8f(
14279                     "Invalid [] range \"%"UTF8f"\"",
14280                     UTF8fARG(UTF, w, rangebegin));
14281                 range = 0; /* not a valid range */
14282             }
14283         }
14284         else {
14285             prevvalue = value; /* save the beginning of the potential range */
14286             if (! stop_at_1     /* Can't be a range if parsing just one thing */
14287                 && *RExC_parse == '-')
14288             {
14289                 char* next_char_ptr = RExC_parse + 1;
14290                 if (skip_white) {   /* Get the next real char after the '-' */
14291                     next_char_ptr = regpatws(pRExC_state,
14292                                              RExC_parse + 1,
14293                                              FALSE); /* means don't recognize
14294                                                         comments */
14295                 }
14296
14297                 /* If the '-' is at the end of the class (just before the ']',
14298                  * it is a literal minus; otherwise it is a range */
14299                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
14300                     RExC_parse = next_char_ptr;
14301
14302                     /* a bad range like \w-, [:word:]- ? */
14303                     if (namedclass > OOB_NAMEDCLASS) {
14304                         if (strict || (PASS2 && ckWARN(WARN_REGEXP))) {
14305                             const int w = RExC_parse >= rangebegin
14306                                           ?  RExC_parse - rangebegin
14307                                           : 0;
14308                             if (strict) {
14309                                 vFAIL4("False [] range \"%*.*s\"",
14310                                     w, w, rangebegin);
14311                             }
14312                             else if (PASS2) {
14313                                 vWARN4(RExC_parse,
14314                                     "False [] range \"%*.*s\"",
14315                                     w, w, rangebegin);
14316                             }
14317                         }
14318                         if (!SIZE_ONLY) {
14319                             cp_list = add_cp_to_invlist(cp_list, '-');
14320                         }
14321                         element_count++;
14322                     } else
14323                         range = 1;      /* yeah, it's a range! */
14324                     continue;   /* but do it the next time */
14325                 }
14326             }
14327         }
14328
14329         if (namedclass > OOB_NAMEDCLASS) {
14330             continue;
14331         }
14332
14333         /* Here, we have a single value, and <prevvalue> is the beginning of
14334          * the range, if any; or <value> if not */
14335
14336         /* non-Latin1 code point implies unicode semantics.  Must be set in
14337          * pass1 so is there for the whole of pass 2 */
14338         if (value > 255) {
14339             RExC_uni_semantics = 1;
14340         }
14341
14342         /* Ready to process either the single value, or the completed range.
14343          * For single-valued non-inverted ranges, we consider the possibility
14344          * of multi-char folds.  (We made a conscious decision to not do this
14345          * for the other cases because it can often lead to non-intuitive
14346          * results.  For example, you have the peculiar case that:
14347          *  "s s" =~ /^[^\xDF]+$/i => Y
14348          *  "ss"  =~ /^[^\xDF]+$/i => N
14349          *
14350          * See [perl #89750] */
14351         if (FOLD && allow_multi_folds && value == prevvalue) {
14352             if (value == LATIN_SMALL_LETTER_SHARP_S
14353                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
14354                                                         value)))
14355             {
14356                 /* Here <value> is indeed a multi-char fold.  Get what it is */
14357
14358                 U8 foldbuf[UTF8_MAXBYTES_CASE];
14359                 STRLEN foldlen;
14360
14361                 UV folded = _to_uni_fold_flags(
14362                                 value,
14363                                 foldbuf,
14364                                 &foldlen,
14365                                 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
14366                                                    ? FOLD_FLAGS_NOMIX_ASCII
14367                                                    : 0)
14368                                 );
14369
14370                 /* Here, <folded> should be the first character of the
14371                  * multi-char fold of <value>, with <foldbuf> containing the
14372                  * whole thing.  But, if this fold is not allowed (because of
14373                  * the flags), <fold> will be the same as <value>, and should
14374                  * be processed like any other character, so skip the special
14375                  * handling */
14376                 if (folded != value) {
14377
14378                     /* Skip if we are recursed, currently parsing the class
14379                      * again.  Otherwise add this character to the list of
14380                      * multi-char folds. */
14381                     if (! RExC_in_multi_char_class) {
14382                         STRLEN cp_count = utf8_length(foldbuf,
14383                                                       foldbuf + foldlen);
14384                         SV* multi_fold = sv_2mortal(newSVpvs(""));
14385
14386                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
14387
14388                         multi_char_matches
14389                                         = add_multi_match(multi_char_matches,
14390                                                           multi_fold,
14391                                                           cp_count);
14392
14393                     }
14394
14395                     /* This element should not be processed further in this
14396                      * class */
14397                     element_count--;
14398                     value = save_value;
14399                     prevvalue = save_prevvalue;
14400                     continue;
14401                 }
14402             }
14403         }
14404
14405         /* Deal with this element of the class */
14406         if (! SIZE_ONLY) {
14407 #ifndef EBCDIC
14408             cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14409                                                      prevvalue, value);
14410 #else
14411             SV* this_range = _new_invlist(1);
14412             _append_range_to_invlist(this_range, prevvalue, value);
14413
14414             /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
14415              * If this range was specified using something like 'i-j', we want
14416              * to include only the 'i' and the 'j', and not anything in
14417              * between, so exclude non-ASCII, non-alphabetics from it.
14418              * However, if the range was specified with something like
14419              * [\x89-\x91] or [\x89-j], all code points within it should be
14420              * included.  literal_endpoint==2 means both ends of the range used
14421              * a literal character, not \x{foo} */
14422             if (literal_endpoint == 2
14423                 && ((isLOWER_A(prevvalue) && isLOWER_A(value))
14424                     || (isUPPER_A(prevvalue) && isUPPER_A(value))))
14425             {
14426                 _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ASCII],
14427                                       &this_range);
14428
14429                 /* Since 'this_range' now only contains ascii, the intersection
14430                  * of it with anything will still yield only ascii */
14431                 _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA],
14432                                       &this_range);
14433             }
14434             _invlist_union(cp_foldable_list, this_range, &cp_foldable_list);
14435             literal_endpoint = 0;
14436             SvREFCNT_dec_NN(this_range);
14437 #endif
14438         }
14439
14440         range = 0; /* this range (if it was one) is done now */
14441     } /* End of loop through all the text within the brackets */
14442
14443     /* If anything in the class expands to more than one character, we have to
14444      * deal with them by building up a substitute parse string, and recursively
14445      * calling reg() on it, instead of proceeding */
14446     if (multi_char_matches) {
14447         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
14448         I32 cp_count;
14449         STRLEN len;
14450         char *save_end = RExC_end;
14451         char *save_parse = RExC_parse;
14452         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
14453                                        a "|" */
14454         I32 reg_flags;
14455
14456         assert(! invert);
14457 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
14458            because too confusing */
14459         if (invert) {
14460             sv_catpv(substitute_parse, "(?:");
14461         }
14462 #endif
14463
14464         /* Look at the longest folds first */
14465         for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
14466
14467             if (av_exists(multi_char_matches, cp_count)) {
14468                 AV** this_array_ptr;
14469                 SV* this_sequence;
14470
14471                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
14472                                                  cp_count, FALSE);
14473                 while ((this_sequence = av_pop(*this_array_ptr)) !=
14474                                                                 &PL_sv_undef)
14475                 {
14476                     if (! first_time) {
14477                         sv_catpv(substitute_parse, "|");
14478                     }
14479                     first_time = FALSE;
14480
14481                     sv_catpv(substitute_parse, SvPVX(this_sequence));
14482                 }
14483             }
14484         }
14485
14486         /* If the character class contains anything else besides these
14487          * multi-character folds, have to include it in recursive parsing */
14488         if (element_count) {
14489             sv_catpv(substitute_parse, "|[");
14490             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
14491             sv_catpv(substitute_parse, "]");
14492         }
14493
14494         sv_catpv(substitute_parse, ")");
14495 #if 0
14496         if (invert) {
14497             /* This is a way to get the parse to skip forward a whole named
14498              * sequence instead of matching the 2nd character when it fails the
14499              * first */
14500             sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
14501         }
14502 #endif
14503
14504         RExC_parse = SvPV(substitute_parse, len);
14505         RExC_end = RExC_parse + len;
14506         RExC_in_multi_char_class = 1;
14507         RExC_override_recoding = 1;
14508         RExC_emit = (regnode *)orig_emit;
14509
14510         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
14511
14512         *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
14513
14514         RExC_parse = save_parse;
14515         RExC_end = save_end;
14516         RExC_in_multi_char_class = 0;
14517         RExC_override_recoding = 0;
14518         SvREFCNT_dec_NN(multi_char_matches);
14519         return ret;
14520     }
14521
14522     /* Here, we've gone through the entire class and dealt with multi-char
14523      * folds.  We are now in a position that we can do some checks to see if we
14524      * can optimize this ANYOF node into a simpler one, even in Pass 1.
14525      * Currently we only do two checks:
14526      * 1) is in the unlikely event that the user has specified both, eg. \w and
14527      *    \W under /l, then the class matches everything.  (This optimization
14528      *    is done only to make the optimizer code run later work.)
14529      * 2) if the character class contains only a single element (including a
14530      *    single range), we see if there is an equivalent node for it.
14531      * Other checks are possible */
14532     if (! ret_invlist   /* Can't optimize if returning the constructed
14533                            inversion list */
14534         && (UNLIKELY(posixl_matches_all) || element_count == 1))
14535     {
14536         U8 op = END;
14537         U8 arg = 0;
14538
14539         if (UNLIKELY(posixl_matches_all)) {
14540             op = SANY;
14541         }
14542         else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
14543                                                    \w or [:digit:] or \p{foo}
14544                                                  */
14545
14546             /* All named classes are mapped into POSIXish nodes, with its FLAG
14547              * argument giving which class it is */
14548             switch ((I32)namedclass) {
14549                 case ANYOF_UNIPROP:
14550                     break;
14551
14552                 /* These don't depend on the charset modifiers.  They always
14553                  * match under /u rules */
14554                 case ANYOF_NHORIZWS:
14555                 case ANYOF_HORIZWS:
14556                     namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
14557                     /* FALLTHROUGH */
14558
14559                 case ANYOF_NVERTWS:
14560                 case ANYOF_VERTWS:
14561                     op = POSIXU;
14562                     goto join_posix;
14563
14564                 /* The actual POSIXish node for all the rest depends on the
14565                  * charset modifier.  The ones in the first set depend only on
14566                  * ASCII or, if available on this platform, locale */
14567                 case ANYOF_ASCII:
14568                 case ANYOF_NASCII:
14569 #ifdef HAS_ISASCII
14570                     op = (LOC) ? POSIXL : POSIXA;
14571 #else
14572                     op = POSIXA;
14573 #endif
14574                     goto join_posix;
14575
14576                 case ANYOF_NCASED:
14577                 case ANYOF_LOWER:
14578                 case ANYOF_NLOWER:
14579                 case ANYOF_UPPER:
14580                 case ANYOF_NUPPER:
14581                     /* under /a could be alpha */
14582                     if (FOLD) {
14583                         if (ASCII_RESTRICTED) {
14584                             namedclass = ANYOF_ALPHA + (namedclass % 2);
14585                         }
14586                         else if (! LOC) {
14587                             break;
14588                         }
14589                     }
14590                     /* FALLTHROUGH */
14591
14592                 /* The rest have more possibilities depending on the charset.
14593                  * We take advantage of the enum ordering of the charset
14594                  * modifiers to get the exact node type, */
14595                 default:
14596                     op = POSIXD + get_regex_charset(RExC_flags);
14597                     if (op > POSIXA) { /* /aa is same as /a */
14598                         op = POSIXA;
14599                     }
14600
14601                 join_posix:
14602                     /* The odd numbered ones are the complements of the
14603                      * next-lower even number one */
14604                     if (namedclass % 2 == 1) {
14605                         invert = ! invert;
14606                         namedclass--;
14607                     }
14608                     arg = namedclass_to_classnum(namedclass);
14609                     break;
14610             }
14611         }
14612         else if (value == prevvalue) {
14613
14614             /* Here, the class consists of just a single code point */
14615
14616             if (invert) {
14617                 if (! LOC && value == '\n') {
14618                     op = REG_ANY; /* Optimize [^\n] */
14619                     *flagp |= HASWIDTH|SIMPLE;
14620                     RExC_naughty++;
14621                 }
14622             }
14623             else if (value < 256 || UTF) {
14624
14625                 /* Optimize a single value into an EXACTish node, but not if it
14626                  * would require converting the pattern to UTF-8. */
14627                 op = compute_EXACTish(pRExC_state);
14628             }
14629         } /* Otherwise is a range */
14630         else if (! LOC) {   /* locale could vary these */
14631             if (prevvalue == '0') {
14632                 if (value == '9') {
14633                     arg = _CC_DIGIT;
14634                     op = POSIXA;
14635                 }
14636             }
14637             else if (prevvalue == 'A') {
14638                 if (value == 'Z'
14639 #ifdef EBCDIC
14640                     && literal_endpoint == 2
14641 #endif
14642                 ) {
14643                     arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
14644                     op = POSIXA;
14645                 }
14646             }
14647             else if (prevvalue == 'a') {
14648                 if (value == 'z'
14649 #ifdef EBCDIC
14650                     && literal_endpoint == 2
14651 #endif
14652                 ) {
14653                     arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
14654                     op = POSIXA;
14655                 }
14656             }
14657         }
14658
14659         /* Here, we have changed <op> away from its initial value iff we found
14660          * an optimization */
14661         if (op != END) {
14662
14663             /* Throw away this ANYOF regnode, and emit the calculated one,
14664              * which should correspond to the beginning, not current, state of
14665              * the parse */
14666             const char * cur_parse = RExC_parse;
14667             RExC_parse = (char *)orig_parse;
14668             if ( SIZE_ONLY) {
14669                 if (! LOC) {
14670
14671                     /* To get locale nodes to not use the full ANYOF size would
14672                      * require moving the code above that writes the portions
14673                      * of it that aren't in other nodes to after this point.
14674                      * e.g.  ANYOF_POSIXL_SET */
14675                     RExC_size = orig_size;
14676                 }
14677             }
14678             else {
14679                 RExC_emit = (regnode *)orig_emit;
14680                 if (PL_regkind[op] == POSIXD) {
14681                     if (op == POSIXL) {
14682                         RExC_contains_locale = 1;
14683                     }
14684                     if (invert) {
14685                         op += NPOSIXD - POSIXD;
14686                     }
14687                 }
14688             }
14689
14690             ret = reg_node(pRExC_state, op);
14691
14692             if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
14693                 if (! SIZE_ONLY) {
14694                     FLAGS(ret) = arg;
14695                 }
14696                 *flagp |= HASWIDTH|SIMPLE;
14697             }
14698             else if (PL_regkind[op] == EXACT) {
14699                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14700                                            TRUE /* downgradable to EXACT */
14701                                            );
14702             }
14703
14704             RExC_parse = (char *) cur_parse;
14705
14706             SvREFCNT_dec(posixes);
14707             SvREFCNT_dec(nposixes);
14708             SvREFCNT_dec(cp_list);
14709             SvREFCNT_dec(cp_foldable_list);
14710             return ret;
14711         }
14712     }
14713
14714     if (SIZE_ONLY)
14715         return ret;
14716     /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
14717
14718     /* If folding, we calculate all characters that could fold to or from the
14719      * ones already on the list */
14720     if (cp_foldable_list) {
14721         if (FOLD) {
14722             UV start, end;      /* End points of code point ranges */
14723
14724             SV* fold_intersection = NULL;
14725             SV** use_list;
14726
14727             /* Our calculated list will be for Unicode rules.  For locale
14728              * matching, we have to keep a separate list that is consulted at
14729              * runtime only when the locale indicates Unicode rules.  For
14730              * non-locale, we just use to the general list */
14731             if (LOC) {
14732                 use_list = &only_utf8_locale_list;
14733             }
14734             else {
14735                 use_list = &cp_list;
14736             }
14737
14738             /* Only the characters in this class that participate in folds need
14739              * be checked.  Get the intersection of this class and all the
14740              * possible characters that are foldable.  This can quickly narrow
14741              * down a large class */
14742             _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
14743                                   &fold_intersection);
14744
14745             /* The folds for all the Latin1 characters are hard-coded into this
14746              * program, but we have to go out to disk to get the others. */
14747             if (invlist_highest(cp_foldable_list) >= 256) {
14748
14749                 /* This is a hash that for a particular fold gives all
14750                  * characters that are involved in it */
14751                 if (! PL_utf8_foldclosures) {
14752                     _load_PL_utf8_foldclosures();
14753                 }
14754             }
14755
14756             /* Now look at the foldable characters in this class individually */
14757             invlist_iterinit(fold_intersection);
14758             while (invlist_iternext(fold_intersection, &start, &end)) {
14759                 UV j;
14760
14761                 /* Look at every character in the range */
14762                 for (j = start; j <= end; j++) {
14763                     U8 foldbuf[UTF8_MAXBYTES_CASE+1];
14764                     STRLEN foldlen;
14765                     SV** listp;
14766
14767                     if (j < 256) {
14768
14769                         if (IS_IN_SOME_FOLD_L1(j)) {
14770
14771                             /* ASCII is always matched; non-ASCII is matched
14772                              * only under Unicode rules (which could happen
14773                              * under /l if the locale is a UTF-8 one */
14774                             if (isASCII(j) || ! DEPENDS_SEMANTICS) {
14775                                 *use_list = add_cp_to_invlist(*use_list,
14776                                                             PL_fold_latin1[j]);
14777                             }
14778                             else {
14779                                 depends_list =
14780                                  add_cp_to_invlist(depends_list,
14781                                                    PL_fold_latin1[j]);
14782                             }
14783                         }
14784
14785                         if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
14786                             && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
14787                         {
14788                             add_above_Latin1_folds(pRExC_state,
14789                                                    (U8) j,
14790                                                    use_list);
14791                         }
14792                         continue;
14793                     }
14794
14795                     /* Here is an above Latin1 character.  We don't have the
14796                      * rules hard-coded for it.  First, get its fold.  This is
14797                      * the simple fold, as the multi-character folds have been
14798                      * handled earlier and separated out */
14799                     _to_uni_fold_flags(j, foldbuf, &foldlen,
14800                                                         (ASCII_FOLD_RESTRICTED)
14801                                                         ? FOLD_FLAGS_NOMIX_ASCII
14802                                                         : 0);
14803
14804                     /* Single character fold of above Latin1.  Add everything in
14805                     * its fold closure to the list that this node should match.
14806                     * The fold closures data structure is a hash with the keys
14807                     * being the UTF-8 of every character that is folded to, like
14808                     * 'k', and the values each an array of all code points that
14809                     * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
14810                     * Multi-character folds are not included */
14811                     if ((listp = hv_fetch(PL_utf8_foldclosures,
14812                                         (char *) foldbuf, foldlen, FALSE)))
14813                     {
14814                         AV* list = (AV*) *listp;
14815                         IV k;
14816                         for (k = 0; k <= av_tindex(list); k++) {
14817                             SV** c_p = av_fetch(list, k, FALSE);
14818                             UV c;
14819                             assert(c_p);
14820
14821                             c = SvUV(*c_p);
14822
14823                             /* /aa doesn't allow folds between ASCII and non- */
14824                             if ((ASCII_FOLD_RESTRICTED
14825                                 && (isASCII(c) != isASCII(j))))
14826                             {
14827                                 continue;
14828                             }
14829
14830                             /* Folds under /l which cross the 255/256 boundary
14831                              * are added to a separate list.  (These are valid
14832                              * only when the locale is UTF-8.) */
14833                             if (c < 256 && LOC) {
14834                                 *use_list = add_cp_to_invlist(*use_list, c);
14835                                 continue;
14836                             }
14837
14838                             if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
14839                             {
14840                                 cp_list = add_cp_to_invlist(cp_list, c);
14841                             }
14842                             else {
14843                                 /* Similarly folds involving non-ascii Latin1
14844                                 * characters under /d are added to their list */
14845                                 depends_list = add_cp_to_invlist(depends_list,
14846                                                                  c);
14847                             }
14848                         }
14849                     }
14850                 }
14851             }
14852             SvREFCNT_dec_NN(fold_intersection);
14853         }
14854
14855         /* Now that we have finished adding all the folds, there is no reason
14856          * to keep the foldable list separate */
14857         _invlist_union(cp_list, cp_foldable_list, &cp_list);
14858         SvREFCNT_dec_NN(cp_foldable_list);
14859     }
14860
14861     /* And combine the result (if any) with any inversion list from posix
14862      * classes.  The lists are kept separate up to now because we don't want to
14863      * fold the classes (folding of those is automatically handled by the swash
14864      * fetching code) */
14865     if (posixes || nposixes) {
14866         if (posixes && AT_LEAST_ASCII_RESTRICTED) {
14867             /* Under /a and /aa, nothing above ASCII matches these */
14868             _invlist_intersection(posixes,
14869                                   PL_XPosix_ptrs[_CC_ASCII],
14870                                   &posixes);
14871         }
14872         if (nposixes) {
14873             if (DEPENDS_SEMANTICS) {
14874                 /* Under /d, everything in the upper half of the Latin1 range
14875                  * matches these complements */
14876                 ANYOF_FLAGS(ret) |= ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII;
14877             }
14878             else if (AT_LEAST_ASCII_RESTRICTED) {
14879                 /* Under /a and /aa, everything above ASCII matches these
14880                  * complements */
14881                 _invlist_union_complement_2nd(nposixes,
14882                                               PL_XPosix_ptrs[_CC_ASCII],
14883                                               &nposixes);
14884             }
14885             if (posixes) {
14886                 _invlist_union(posixes, nposixes, &posixes);
14887                 SvREFCNT_dec_NN(nposixes);
14888             }
14889             else {
14890                 posixes = nposixes;
14891             }
14892         }
14893         if (! DEPENDS_SEMANTICS) {
14894             if (cp_list) {
14895                 _invlist_union(cp_list, posixes, &cp_list);
14896                 SvREFCNT_dec_NN(posixes);
14897             }
14898             else {
14899                 cp_list = posixes;
14900             }
14901         }
14902         else {
14903             /* Under /d, we put into a separate list the Latin1 things that
14904              * match only when the target string is utf8 */
14905             SV* nonascii_but_latin1_properties = NULL;
14906             _invlist_intersection(posixes, PL_UpperLatin1,
14907                                   &nonascii_but_latin1_properties);
14908             _invlist_subtract(posixes, nonascii_but_latin1_properties,
14909                               &posixes);
14910             if (cp_list) {
14911                 _invlist_union(cp_list, posixes, &cp_list);
14912                 SvREFCNT_dec_NN(posixes);
14913             }
14914             else {
14915                 cp_list = posixes;
14916             }
14917
14918             if (depends_list) {
14919                 _invlist_union(depends_list, nonascii_but_latin1_properties,
14920                                &depends_list);
14921                 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
14922             }
14923             else {
14924                 depends_list = nonascii_but_latin1_properties;
14925             }
14926         }
14927     }
14928
14929     /* And combine the result (if any) with any inversion list from properties.
14930      * The lists are kept separate up to now so that we can distinguish the two
14931      * in regards to matching above-Unicode.  A run-time warning is generated
14932      * if a Unicode property is matched against a non-Unicode code point. But,
14933      * we allow user-defined properties to match anything, without any warning,
14934      * and we also suppress the warning if there is a portion of the character
14935      * class that isn't a Unicode property, and which matches above Unicode, \W
14936      * or [\x{110000}] for example.
14937      * (Note that in this case, unlike the Posix one above, there is no
14938      * <depends_list>, because having a Unicode property forces Unicode
14939      * semantics */
14940     if (properties) {
14941         if (cp_list) {
14942
14943             /* If it matters to the final outcome, see if a non-property
14944              * component of the class matches above Unicode.  If so, the
14945              * warning gets suppressed.  This is true even if just a single
14946              * such code point is specified, as though not strictly correct if
14947              * another such code point is matched against, the fact that they
14948              * are using above-Unicode code points indicates they should know
14949              * the issues involved */
14950             if (warn_super) {
14951                 warn_super = ! (invert
14952                                ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
14953             }
14954
14955             _invlist_union(properties, cp_list, &cp_list);
14956             SvREFCNT_dec_NN(properties);
14957         }
14958         else {
14959             cp_list = properties;
14960         }
14961
14962         if (warn_super) {
14963             ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
14964         }
14965     }
14966
14967     /* Here, we have calculated what code points should be in the character
14968      * class.
14969      *
14970      * Now we can see about various optimizations.  Fold calculation (which we
14971      * did above) needs to take place before inversion.  Otherwise /[^k]/i
14972      * would invert to include K, which under /i would match k, which it
14973      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
14974      * folded until runtime */
14975
14976     /* If we didn't do folding, it's because some information isn't available
14977      * until runtime; set the run-time fold flag for these.  (We don't have to
14978      * worry about properties folding, as that is taken care of by the swash
14979      * fetching).  We know to set the flag if we have a non-NULL list for UTF-8
14980      * locales, or the class matches at least one 0-255 range code point */
14981     if (LOC && FOLD) {
14982         if (only_utf8_locale_list) {
14983             ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14984         }
14985         else if (cp_list) { /* Look to see if there a 0-255 code point is in
14986                                the list */
14987             UV start, end;
14988             invlist_iterinit(cp_list);
14989             if (invlist_iternext(cp_list, &start, &end) && start < 256) {
14990                 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14991             }
14992             invlist_iterfinish(cp_list);
14993         }
14994     }
14995
14996     /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
14997      * at compile time.  Besides not inverting folded locale now, we can't
14998      * invert if there are things such as \w, which aren't known until runtime
14999      * */
15000     if (cp_list
15001         && invert
15002         && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
15003         && ! depends_list
15004         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
15005     {
15006         _invlist_invert(cp_list);
15007
15008         /* Any swash can't be used as-is, because we've inverted things */
15009         if (swash) {
15010             SvREFCNT_dec_NN(swash);
15011             swash = NULL;
15012         }
15013
15014         /* Clear the invert flag since have just done it here */
15015         invert = FALSE;
15016     }
15017
15018     if (ret_invlist) {
15019         *ret_invlist = cp_list;
15020         SvREFCNT_dec(swash);
15021
15022         /* Discard the generated node */
15023         if (SIZE_ONLY) {
15024             RExC_size = orig_size;
15025         }
15026         else {
15027             RExC_emit = orig_emit;
15028         }
15029         return orig_emit;
15030     }
15031
15032     /* Some character classes are equivalent to other nodes.  Such nodes take
15033      * up less room and generally fewer operations to execute than ANYOF nodes.
15034      * Above, we checked for and optimized into some such equivalents for
15035      * certain common classes that are easy to test.  Getting to this point in
15036      * the code means that the class didn't get optimized there.  Since this
15037      * code is only executed in Pass 2, it is too late to save space--it has
15038      * been allocated in Pass 1, and currently isn't given back.  But turning
15039      * things into an EXACTish node can allow the optimizer to join it to any
15040      * adjacent such nodes.  And if the class is equivalent to things like /./,
15041      * expensive run-time swashes can be avoided.  Now that we have more
15042      * complete information, we can find things necessarily missed by the
15043      * earlier code.  I (khw) am not sure how much to look for here.  It would
15044      * be easy, but perhaps too slow, to check any candidates against all the
15045      * node types they could possibly match using _invlistEQ(). */
15046
15047     if (cp_list
15048         && ! invert
15049         && ! depends_list
15050         && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
15051         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
15052
15053            /* We don't optimize if we are supposed to make sure all non-Unicode
15054             * code points raise a warning, as only ANYOF nodes have this check.
15055             * */
15056         && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
15057     {
15058         UV start, end;
15059         U8 op = END;  /* The optimzation node-type */
15060         const char * cur_parse= RExC_parse;
15061
15062         invlist_iterinit(cp_list);
15063         if (! invlist_iternext(cp_list, &start, &end)) {
15064
15065             /* Here, the list is empty.  This happens, for example, when a
15066              * Unicode property is the only thing in the character class, and
15067              * it doesn't match anything.  (perluniprops.pod notes such
15068              * properties) */
15069             op = OPFAIL;
15070             *flagp |= HASWIDTH|SIMPLE;
15071         }
15072         else if (start == end) {    /* The range is a single code point */
15073             if (! invlist_iternext(cp_list, &start, &end)
15074
15075                     /* Don't do this optimization if it would require changing
15076                      * the pattern to UTF-8 */
15077                 && (start < 256 || UTF))
15078             {
15079                 /* Here, the list contains a single code point.  Can optimize
15080                  * into an EXACTish node */
15081
15082                 value = start;
15083
15084                 if (! FOLD) {
15085                     op = EXACT;
15086                 }
15087                 else if (LOC) {
15088
15089                     /* A locale node under folding with one code point can be
15090                      * an EXACTFL, as its fold won't be calculated until
15091                      * runtime */
15092                     op = EXACTFL;
15093                 }
15094                 else {
15095
15096                     /* Here, we are generally folding, but there is only one
15097                      * code point to match.  If we have to, we use an EXACT
15098                      * node, but it would be better for joining with adjacent
15099                      * nodes in the optimization pass if we used the same
15100                      * EXACTFish node that any such are likely to be.  We can
15101                      * do this iff the code point doesn't participate in any
15102                      * folds.  For example, an EXACTF of a colon is the same as
15103                      * an EXACT one, since nothing folds to or from a colon. */
15104                     if (value < 256) {
15105                         if (IS_IN_SOME_FOLD_L1(value)) {
15106                             op = EXACT;
15107                         }
15108                     }
15109                     else {
15110                         if (_invlist_contains_cp(PL_utf8_foldable, value)) {
15111                             op = EXACT;
15112                         }
15113                     }
15114
15115                     /* If we haven't found the node type, above, it means we
15116                      * can use the prevailing one */
15117                     if (op == END) {
15118                         op = compute_EXACTish(pRExC_state);
15119                     }
15120                 }
15121             }
15122         }
15123         else if (start == 0) {
15124             if (end == UV_MAX) {
15125                 op = SANY;
15126                 *flagp |= HASWIDTH|SIMPLE;
15127                 RExC_naughty++;
15128             }
15129             else if (end == '\n' - 1
15130                     && invlist_iternext(cp_list, &start, &end)
15131                     && start == '\n' + 1 && end == UV_MAX)
15132             {
15133                 op = REG_ANY;
15134                 *flagp |= HASWIDTH|SIMPLE;
15135                 RExC_naughty++;
15136             }
15137         }
15138         invlist_iterfinish(cp_list);
15139
15140         if (op != END) {
15141             RExC_parse = (char *)orig_parse;
15142             RExC_emit = (regnode *)orig_emit;
15143
15144             ret = reg_node(pRExC_state, op);
15145
15146             RExC_parse = (char *)cur_parse;
15147
15148             if (PL_regkind[op] == EXACT) {
15149                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
15150                                            TRUE /* downgradable to EXACT */
15151                                           );
15152             }
15153
15154             SvREFCNT_dec_NN(cp_list);
15155             return ret;
15156         }
15157     }
15158
15159     /* Here, <cp_list> contains all the code points we can determine at
15160      * compile time that match under all conditions.  Go through it, and
15161      * for things that belong in the bitmap, put them there, and delete from
15162      * <cp_list>.  While we are at it, see if everything above 255 is in the
15163      * list, and if so, set a flag to speed up execution */
15164
15165     populate_ANYOF_from_invlist(ret, &cp_list);
15166
15167     if (invert) {
15168         ANYOF_FLAGS(ret) |= ANYOF_INVERT;
15169     }
15170
15171     /* Here, the bitmap has been populated with all the Latin1 code points that
15172      * always match.  Can now add to the overall list those that match only
15173      * when the target string is UTF-8 (<depends_list>). */
15174     if (depends_list) {
15175         if (cp_list) {
15176             _invlist_union(cp_list, depends_list, &cp_list);
15177             SvREFCNT_dec_NN(depends_list);
15178         }
15179         else {
15180             cp_list = depends_list;
15181         }
15182         ANYOF_FLAGS(ret) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
15183     }
15184
15185     /* If there is a swash and more than one element, we can't use the swash in
15186      * the optimization below. */
15187     if (swash && element_count > 1) {
15188         SvREFCNT_dec_NN(swash);
15189         swash = NULL;
15190     }
15191
15192     /* Note that the optimization of using 'swash' if it is the only thing in
15193      * the class doesn't have us change swash at all, so it can include things
15194      * that are also in the bitmap; otherwise we have purposely deleted that
15195      * duplicate information */
15196     set_ANYOF_arg(pRExC_state, ret, cp_list,
15197                   (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
15198                    ? listsv : NULL,
15199                   only_utf8_locale_list,
15200                   swash, has_user_defined_property);
15201
15202     *flagp |= HASWIDTH|SIMPLE;
15203
15204     if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
15205         RExC_contains_locale = 1;
15206     }
15207
15208     return ret;
15209 }
15210
15211 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
15212
15213 STATIC void
15214 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
15215                 regnode* const node,
15216                 SV* const cp_list,
15217                 SV* const runtime_defns,
15218                 SV* const only_utf8_locale_list,
15219                 SV* const swash,
15220                 const bool has_user_defined_property)
15221 {
15222     /* Sets the arg field of an ANYOF-type node 'node', using information about
15223      * the node passed-in.  If there is nothing outside the node's bitmap, the
15224      * arg is set to ANYOF_ONLY_HAS_BITMAP.  Otherwise, it sets the argument to
15225      * the count returned by add_data(), having allocated and stored an array,
15226      * av, that that count references, as follows:
15227      *  av[0] stores the character class description in its textual form.
15228      *        This is used later (regexec.c:Perl_regclass_swash()) to
15229      *        initialize the appropriate swash, and is also useful for dumping
15230      *        the regnode.  This is set to &PL_sv_undef if the textual
15231      *        description is not needed at run-time (as happens if the other
15232      *        elements completely define the class)
15233      *  av[1] if &PL_sv_undef, is a placeholder to later contain the swash
15234      *        computed from av[0].  But if no further computation need be done,
15235      *        the swash is stored here now (and av[0] is &PL_sv_undef).
15236      *  av[2] stores the inversion list of code points that match only if the
15237      *        current locale is UTF-8
15238      *  av[3] stores the cp_list inversion list for use in addition or instead
15239      *        of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
15240      *        (Otherwise everything needed is already in av[0] and av[1])
15241      *  av[4] is set if any component of the class is from a user-defined
15242      *        property; used only if av[3] exists */
15243
15244     UV n;
15245
15246     PERL_ARGS_ASSERT_SET_ANYOF_ARG;
15247
15248     if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
15249         assert(! (ANYOF_FLAGS(node)
15250                   & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
15251                      |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES)));
15252         ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
15253     }
15254     else {
15255         AV * const av = newAV();
15256         SV *rv;
15257
15258         assert(ANYOF_FLAGS(node)
15259                & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
15260                   |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
15261
15262         av_store(av, 0, (runtime_defns)
15263                         ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
15264         if (swash) {
15265             assert(cp_list);
15266             av_store(av, 1, swash);
15267             SvREFCNT_dec_NN(cp_list);
15268         }
15269         else {
15270             av_store(av, 1, &PL_sv_undef);
15271             if (cp_list) {
15272                 av_store(av, 3, cp_list);
15273                 av_store(av, 4, newSVuv(has_user_defined_property));
15274             }
15275         }
15276
15277         if (only_utf8_locale_list) {
15278             av_store(av, 2, only_utf8_locale_list);
15279         }
15280         else {
15281             av_store(av, 2, &PL_sv_undef);
15282         }
15283
15284         rv = newRV_noinc(MUTABLE_SV(av));
15285         n = add_data(pRExC_state, STR_WITH_LEN("s"));
15286         RExC_rxi->data->data[n] = (void*)rv;
15287         ARG_SET(node, n);
15288     }
15289 }
15290
15291 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
15292 SV *
15293 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
15294                                         const regnode* node,
15295                                         bool doinit,
15296                                         SV** listsvp,
15297                                         SV** only_utf8_locale_ptr,
15298                                         SV*  exclude_list)
15299
15300 {
15301     /* For internal core use only.
15302      * Returns the swash for the input 'node' in the regex 'prog'.
15303      * If <doinit> is 'true', will attempt to create the swash if not already
15304      *    done.
15305      * If <listsvp> is non-null, will return the printable contents of the
15306      *    swash.  This can be used to get debugging information even before the
15307      *    swash exists, by calling this function with 'doinit' set to false, in
15308      *    which case the components that will be used to eventually create the
15309      *    swash are returned  (in a printable form).
15310      * If <exclude_list> is not NULL, it is an inversion list of things to
15311      *    exclude from what's returned in <listsvp>.
15312      * Tied intimately to how S_set_ANYOF_arg sets up the data structure.  Note
15313      * that, in spite of this function's name, the swash it returns may include
15314      * the bitmap data as well */
15315
15316     SV *sw  = NULL;
15317     SV *si  = NULL;         /* Input swash initialization string */
15318     SV*  invlist = NULL;
15319
15320     RXi_GET_DECL(prog,progi);
15321     const struct reg_data * const data = prog ? progi->data : NULL;
15322
15323     PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
15324
15325     assert(ANYOF_FLAGS(node)
15326         & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
15327            |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
15328
15329     if (data && data->count) {
15330         const U32 n = ARG(node);
15331
15332         if (data->what[n] == 's') {
15333             SV * const rv = MUTABLE_SV(data->data[n]);
15334             AV * const av = MUTABLE_AV(SvRV(rv));
15335             SV **const ary = AvARRAY(av);
15336             U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
15337
15338             si = *ary;  /* ary[0] = the string to initialize the swash with */
15339
15340             /* Elements 3 and 4 are either both present or both absent. [3] is
15341              * any inversion list generated at compile time; [4] indicates if
15342              * that inversion list has any user-defined properties in it. */
15343             if (av_tindex(av) >= 2) {
15344                 if (only_utf8_locale_ptr
15345                     && ary[2]
15346                     && ary[2] != &PL_sv_undef)
15347                 {
15348                     *only_utf8_locale_ptr = ary[2];
15349                 }
15350                 else {
15351                     assert(only_utf8_locale_ptr);
15352                     *only_utf8_locale_ptr = NULL;
15353                 }
15354
15355                 if (av_tindex(av) >= 3) {
15356                     invlist = ary[3];
15357                     if (SvUV(ary[4])) {
15358                         swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
15359                     }
15360                 }
15361                 else {
15362                     invlist = NULL;
15363                 }
15364             }
15365
15366             /* Element [1] is reserved for the set-up swash.  If already there,
15367              * return it; if not, create it and store it there */
15368             if (ary[1] && SvROK(ary[1])) {
15369                 sw = ary[1];
15370             }
15371             else if (doinit && ((si && si != &PL_sv_undef)
15372                                  || (invlist && invlist != &PL_sv_undef))) {
15373                 assert(si);
15374                 sw = _core_swash_init("utf8", /* the utf8 package */
15375                                       "", /* nameless */
15376                                       si,
15377                                       1, /* binary */
15378                                       0, /* not from tr/// */
15379                                       invlist,
15380                                       &swash_init_flags);
15381                 (void)av_store(av, 1, sw);
15382             }
15383         }
15384     }
15385
15386     /* If requested, return a printable version of what this swash matches */
15387     if (listsvp) {
15388         SV* matches_string = newSVpvs("");
15389
15390         /* The swash should be used, if possible, to get the data, as it
15391          * contains the resolved data.  But this function can be called at
15392          * compile-time, before everything gets resolved, in which case we
15393          * return the currently best available information, which is the string
15394          * that will eventually be used to do that resolving, 'si' */
15395         if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
15396             && (si && si != &PL_sv_undef))
15397         {
15398             sv_catsv(matches_string, si);
15399         }
15400
15401         /* Add the inversion list to whatever we have.  This may have come from
15402          * the swash, or from an input parameter */
15403         if (invlist) {
15404             if (exclude_list) {
15405                 SV* clone = invlist_clone(invlist);
15406                 _invlist_subtract(clone, exclude_list, &clone);
15407                 sv_catsv(matches_string, _invlist_contents(clone));
15408                 SvREFCNT_dec_NN(clone);
15409             }
15410             else {
15411                 sv_catsv(matches_string, _invlist_contents(invlist));
15412             }
15413         }
15414         *listsvp = matches_string;
15415     }
15416
15417     return sw;
15418 }
15419 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
15420
15421 /* reg_skipcomment()
15422
15423    Absorbs an /x style # comment from the input stream,
15424    returning a pointer to the first character beyond the comment, or if the
15425    comment terminates the pattern without anything following it, this returns
15426    one past the final character of the pattern (in other words, RExC_end) and
15427    sets the REG_RUN_ON_COMMENT_SEEN flag.
15428
15429    Note it's the callers responsibility to ensure that we are
15430    actually in /x mode
15431
15432 */
15433
15434 PERL_STATIC_INLINE char*
15435 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
15436 {
15437     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
15438
15439     assert(*p == '#');
15440
15441     while (p < RExC_end) {
15442         if (*(++p) == '\n') {
15443             return p+1;
15444         }
15445     }
15446
15447     /* we ran off the end of the pattern without ending the comment, so we have
15448      * to add an \n when wrapping */
15449     RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
15450     return p;
15451 }
15452
15453 /* nextchar()
15454
15455    Advances the parse position, and optionally absorbs
15456    "whitespace" from the inputstream.
15457
15458    Without /x "whitespace" means (?#...) style comments only,
15459    with /x this means (?#...) and # comments and whitespace proper.
15460
15461    Returns the RExC_parse point from BEFORE the scan occurs.
15462
15463    This is the /x friendly way of saying RExC_parse++.
15464 */
15465
15466 STATIC char*
15467 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
15468 {
15469     char* const retval = RExC_parse++;
15470
15471     PERL_ARGS_ASSERT_NEXTCHAR;
15472
15473     for (;;) {
15474         if (RExC_end - RExC_parse >= 3
15475             && *RExC_parse == '('
15476             && RExC_parse[1] == '?'
15477             && RExC_parse[2] == '#')
15478         {
15479             while (*RExC_parse != ')') {
15480                 if (RExC_parse == RExC_end)
15481                     FAIL("Sequence (?#... not terminated");
15482                 RExC_parse++;
15483             }
15484             RExC_parse++;
15485             continue;
15486         }
15487         if (RExC_flags & RXf_PMf_EXTENDED) {
15488             char * p = regpatws(pRExC_state, RExC_parse,
15489                                           TRUE); /* means recognize comments */
15490             if (p != RExC_parse) {
15491                 RExC_parse = p;
15492                 continue;
15493             }
15494         }
15495         return retval;
15496     }
15497 }
15498
15499 STATIC regnode *
15500 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
15501 {
15502     /* Allocate a regnode for 'op' and returns it, with 'extra_size' extra
15503      * space.  In pass1, it aligns and increments RExC_size; in pass2,
15504      * RExC_emit */
15505
15506     regnode * const ret = RExC_emit;
15507     GET_RE_DEBUG_FLAGS_DECL;
15508
15509     PERL_ARGS_ASSERT_REGNODE_GUTS;
15510
15511     assert(extra_size >= regarglen[op]);
15512
15513     if (SIZE_ONLY) {
15514         SIZE_ALIGN(RExC_size);
15515         RExC_size += 1 + extra_size;
15516         return(ret);
15517     }
15518     if (RExC_emit >= RExC_emit_bound)
15519         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15520                    op, (void*)RExC_emit, (void*)RExC_emit_bound);
15521
15522     NODE_ALIGN_FILL(ret);
15523 #ifndef RE_TRACK_PATTERN_OFFSETS
15524     PERL_UNUSED_ARG(name);
15525 #else
15526     if (RExC_offsets) {         /* MJD */
15527         MJD_OFFSET_DEBUG(
15528               ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
15529               name, __LINE__,
15530               PL_reg_name[op],
15531               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
15532                 ? "Overwriting end of array!\n" : "OK",
15533               (UV)(RExC_emit - RExC_emit_start),
15534               (UV)(RExC_parse - RExC_start),
15535               (UV)RExC_offsets[0]));
15536         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
15537     }
15538 #endif
15539     return(ret);
15540 }
15541
15542 /*
15543 - reg_node - emit a node
15544 */
15545 STATIC regnode *                        /* Location. */
15546 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
15547 {
15548     regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
15549
15550     PERL_ARGS_ASSERT_REG_NODE;
15551
15552     assert(regarglen[op] == 0);
15553
15554     if (PASS2) {
15555         regnode *ptr = ret;
15556         FILL_ADVANCE_NODE(ptr, op);
15557         RExC_emit = ptr;
15558     }
15559     return(ret);
15560 }
15561
15562 /*
15563 - reganode - emit a node with an argument
15564 */
15565 STATIC regnode *                        /* Location. */
15566 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
15567 {
15568     regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
15569
15570     PERL_ARGS_ASSERT_REGANODE;
15571
15572     assert(regarglen[op] == 1);
15573
15574     if (PASS2) {
15575         regnode *ptr = ret;
15576         FILL_ADVANCE_NODE_ARG(ptr, op, arg);
15577         RExC_emit = ptr;
15578     }
15579     return(ret);
15580 }
15581
15582 STATIC regnode *
15583 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
15584 {
15585     /* emit a node with U32 and I32 arguments */
15586
15587     regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
15588
15589     PERL_ARGS_ASSERT_REG2LANODE;
15590
15591     assert(regarglen[op] == 2);
15592
15593     if (PASS2) {
15594         regnode *ptr = ret;
15595         FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
15596         RExC_emit = ptr;
15597     }
15598     return(ret);
15599 }
15600
15601 /*
15602 - reguni - emit (if appropriate) a Unicode character
15603 */
15604 PERL_STATIC_INLINE STRLEN
15605 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
15606 {
15607     PERL_ARGS_ASSERT_REGUNI;
15608
15609     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
15610 }
15611
15612 /*
15613 - reginsert - insert an operator in front of already-emitted operand
15614 *
15615 * Means relocating the operand.
15616 */
15617 STATIC void
15618 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
15619 {
15620     regnode *src;
15621     regnode *dst;
15622     regnode *place;
15623     const int offset = regarglen[(U8)op];
15624     const int size = NODE_STEP_REGNODE + offset;
15625     GET_RE_DEBUG_FLAGS_DECL;
15626
15627     PERL_ARGS_ASSERT_REGINSERT;
15628     PERL_UNUSED_CONTEXT;
15629     PERL_UNUSED_ARG(depth);
15630 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
15631     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
15632     if (SIZE_ONLY) {
15633         RExC_size += size;
15634         return;
15635     }
15636
15637     src = RExC_emit;
15638     RExC_emit += size;
15639     dst = RExC_emit;
15640     if (RExC_open_parens) {
15641         int paren;
15642         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
15643         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
15644             if ( RExC_open_parens[paren] >= opnd ) {
15645                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
15646                 RExC_open_parens[paren] += size;
15647             } else {
15648                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
15649             }
15650             if ( RExC_close_parens[paren] >= opnd ) {
15651                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
15652                 RExC_close_parens[paren] += size;
15653             } else {
15654                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
15655             }
15656         }
15657     }
15658
15659     while (src > opnd) {
15660         StructCopy(--src, --dst, regnode);
15661 #ifdef RE_TRACK_PATTERN_OFFSETS
15662         if (RExC_offsets) {     /* MJD 20010112 */
15663             MJD_OFFSET_DEBUG(
15664                  ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
15665                   "reg_insert",
15666                   __LINE__,
15667                   PL_reg_name[op],
15668                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
15669                     ? "Overwriting end of array!\n" : "OK",
15670                   (UV)(src - RExC_emit_start),
15671                   (UV)(dst - RExC_emit_start),
15672                   (UV)RExC_offsets[0]));
15673             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
15674             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
15675         }
15676 #endif
15677     }
15678
15679
15680     place = opnd;               /* Op node, where operand used to be. */
15681 #ifdef RE_TRACK_PATTERN_OFFSETS
15682     if (RExC_offsets) {         /* MJD */
15683         MJD_OFFSET_DEBUG(
15684               ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15685               "reginsert",
15686               __LINE__,
15687               PL_reg_name[op],
15688               (UV)(place - RExC_emit_start) > RExC_offsets[0]
15689               ? "Overwriting end of array!\n" : "OK",
15690               (UV)(place - RExC_emit_start),
15691               (UV)(RExC_parse - RExC_start),
15692               (UV)RExC_offsets[0]));
15693         Set_Node_Offset(place, RExC_parse);
15694         Set_Node_Length(place, 1);
15695     }
15696 #endif
15697     src = NEXTOPER(place);
15698     FILL_ADVANCE_NODE(place, op);
15699     Zero(src, offset, regnode);
15700 }
15701
15702 /*
15703 - regtail - set the next-pointer at the end of a node chain of p to val.
15704 - SEE ALSO: regtail_study
15705 */
15706 /* TODO: All three parms should be const */
15707 STATIC void
15708 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15709                 const regnode *val,U32 depth)
15710 {
15711     regnode *scan;
15712     GET_RE_DEBUG_FLAGS_DECL;
15713
15714     PERL_ARGS_ASSERT_REGTAIL;
15715 #ifndef DEBUGGING
15716     PERL_UNUSED_ARG(depth);
15717 #endif
15718
15719     if (SIZE_ONLY)
15720         return;
15721
15722     /* Find last node. */
15723     scan = p;
15724     for (;;) {
15725         regnode * const temp = regnext(scan);
15726         DEBUG_PARSE_r({
15727             SV * const mysv=sv_newmortal();
15728             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
15729             regprop(RExC_rx, mysv, scan, NULL);
15730             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
15731                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
15732                     (temp == NULL ? "->" : ""),
15733                     (temp == NULL ? PL_reg_name[OP(val)] : "")
15734             );
15735         });
15736         if (temp == NULL)
15737             break;
15738         scan = temp;
15739     }
15740
15741     if (reg_off_by_arg[OP(scan)]) {
15742         ARG_SET(scan, val - scan);
15743     }
15744     else {
15745         NEXT_OFF(scan) = val - scan;
15746     }
15747 }
15748
15749 #ifdef DEBUGGING
15750 /*
15751 - regtail_study - set the next-pointer at the end of a node chain of p to val.
15752 - Look for optimizable sequences at the same time.
15753 - currently only looks for EXACT chains.
15754
15755 This is experimental code. The idea is to use this routine to perform
15756 in place optimizations on branches and groups as they are constructed,
15757 with the long term intention of removing optimization from study_chunk so
15758 that it is purely analytical.
15759
15760 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
15761 to control which is which.
15762
15763 */
15764 /* TODO: All four parms should be const */
15765
15766 STATIC U8
15767 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15768                       const regnode *val,U32 depth)
15769 {
15770     regnode *scan;
15771     U8 exact = PSEUDO;
15772 #ifdef EXPERIMENTAL_INPLACESCAN
15773     I32 min = 0;
15774 #endif
15775     GET_RE_DEBUG_FLAGS_DECL;
15776
15777     PERL_ARGS_ASSERT_REGTAIL_STUDY;
15778
15779
15780     if (SIZE_ONLY)
15781         return exact;
15782
15783     /* Find last node. */
15784
15785     scan = p;
15786     for (;;) {
15787         regnode * const temp = regnext(scan);
15788 #ifdef EXPERIMENTAL_INPLACESCAN
15789         if (PL_regkind[OP(scan)] == EXACT) {
15790             bool unfolded_multi_char;   /* Unexamined in this routine */
15791             if (join_exact(pRExC_state, scan, &min,
15792                            &unfolded_multi_char, 1, val, depth+1))
15793                 return EXACT;
15794         }
15795 #endif
15796         if ( exact ) {
15797             switch (OP(scan)) {
15798                 case EXACT:
15799                 case EXACTF:
15800                 case EXACTFA_NO_TRIE:
15801                 case EXACTFA:
15802                 case EXACTFU:
15803                 case EXACTFU_SS:
15804                 case EXACTFL:
15805                         if( exact == PSEUDO )
15806                             exact= OP(scan);
15807                         else if ( exact != OP(scan) )
15808                             exact= 0;
15809                 case NOTHING:
15810                     break;
15811                 default:
15812                     exact= 0;
15813             }
15814         }
15815         DEBUG_PARSE_r({
15816             SV * const mysv=sv_newmortal();
15817             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
15818             regprop(RExC_rx, mysv, scan, NULL);
15819             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
15820                 SvPV_nolen_const(mysv),
15821                 REG_NODE_NUM(scan),
15822                 PL_reg_name[exact]);
15823         });
15824         if (temp == NULL)
15825             break;
15826         scan = temp;
15827     }
15828     DEBUG_PARSE_r({
15829         SV * const mysv_val=sv_newmortal();
15830         DEBUG_PARSE_MSG("");
15831         regprop(RExC_rx, mysv_val, val, NULL);
15832         PerlIO_printf(Perl_debug_log,
15833                       "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
15834                       SvPV_nolen_const(mysv_val),
15835                       (IV)REG_NODE_NUM(val),
15836                       (IV)(val - scan)
15837         );
15838     });
15839     if (reg_off_by_arg[OP(scan)]) {
15840         ARG_SET(scan, val - scan);
15841     }
15842     else {
15843         NEXT_OFF(scan) = val - scan;
15844     }
15845
15846     return exact;
15847 }
15848 #endif
15849
15850 /*
15851  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
15852  */
15853 #ifdef DEBUGGING
15854
15855 static void
15856 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
15857 {
15858     int bit;
15859     int set=0;
15860
15861     ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15862
15863     for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
15864         if (flags & (1<<bit)) {
15865             if (!set++ && lead)
15866                 PerlIO_printf(Perl_debug_log, "%s",lead);
15867             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
15868         }
15869     }
15870     if (lead)  {
15871         if (set)
15872             PerlIO_printf(Perl_debug_log, "\n");
15873         else
15874             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15875     }
15876 }
15877
15878 static void
15879 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
15880 {
15881     int bit;
15882     int set=0;
15883     regex_charset cs;
15884
15885     ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15886
15887     for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
15888         if (flags & (1<<bit)) {
15889             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
15890                 continue;
15891             }
15892             if (!set++ && lead)
15893                 PerlIO_printf(Perl_debug_log, "%s",lead);
15894             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
15895         }
15896     }
15897     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
15898             if (!set++ && lead) {
15899                 PerlIO_printf(Perl_debug_log, "%s",lead);
15900             }
15901             switch (cs) {
15902                 case REGEX_UNICODE_CHARSET:
15903                     PerlIO_printf(Perl_debug_log, "UNICODE");
15904                     break;
15905                 case REGEX_LOCALE_CHARSET:
15906                     PerlIO_printf(Perl_debug_log, "LOCALE");
15907                     break;
15908                 case REGEX_ASCII_RESTRICTED_CHARSET:
15909                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
15910                     break;
15911                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
15912                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
15913                     break;
15914                 default:
15915                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
15916                     break;
15917             }
15918     }
15919     if (lead)  {
15920         if (set)
15921             PerlIO_printf(Perl_debug_log, "\n");
15922         else
15923             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15924     }
15925 }
15926 #endif
15927
15928 void
15929 Perl_regdump(pTHX_ const regexp *r)
15930 {
15931 #ifdef DEBUGGING
15932     SV * const sv = sv_newmortal();
15933     SV *dsv= sv_newmortal();
15934     RXi_GET_DECL(r,ri);
15935     GET_RE_DEBUG_FLAGS_DECL;
15936
15937     PERL_ARGS_ASSERT_REGDUMP;
15938
15939     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
15940
15941     /* Header fields of interest. */
15942     if (r->anchored_substr) {
15943         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
15944             RE_SV_DUMPLEN(r->anchored_substr), 30);
15945         PerlIO_printf(Perl_debug_log,
15946                       "anchored %s%s at %"IVdf" ",
15947                       s, RE_SV_TAIL(r->anchored_substr),
15948                       (IV)r->anchored_offset);
15949     } else if (r->anchored_utf8) {
15950         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
15951             RE_SV_DUMPLEN(r->anchored_utf8), 30);
15952         PerlIO_printf(Perl_debug_log,
15953                       "anchored utf8 %s%s at %"IVdf" ",
15954                       s, RE_SV_TAIL(r->anchored_utf8),
15955                       (IV)r->anchored_offset);
15956     }
15957     if (r->float_substr) {
15958         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
15959             RE_SV_DUMPLEN(r->float_substr), 30);
15960         PerlIO_printf(Perl_debug_log,
15961                       "floating %s%s at %"IVdf"..%"UVuf" ",
15962                       s, RE_SV_TAIL(r->float_substr),
15963                       (IV)r->float_min_offset, (UV)r->float_max_offset);
15964     } else if (r->float_utf8) {
15965         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
15966             RE_SV_DUMPLEN(r->float_utf8), 30);
15967         PerlIO_printf(Perl_debug_log,
15968                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
15969                       s, RE_SV_TAIL(r->float_utf8),
15970                       (IV)r->float_min_offset, (UV)r->float_max_offset);
15971     }
15972     if (r->check_substr || r->check_utf8)
15973         PerlIO_printf(Perl_debug_log,
15974                       (const char *)
15975                       (r->check_substr == r->float_substr
15976                        && r->check_utf8 == r->float_utf8
15977                        ? "(checking floating" : "(checking anchored"));
15978     if (r->intflags & PREGf_NOSCAN)
15979         PerlIO_printf(Perl_debug_log, " noscan");
15980     if (r->extflags & RXf_CHECK_ALL)
15981         PerlIO_printf(Perl_debug_log, " isall");
15982     if (r->check_substr || r->check_utf8)
15983         PerlIO_printf(Perl_debug_log, ") ");
15984
15985     if (ri->regstclass) {
15986         regprop(r, sv, ri->regstclass, NULL);
15987         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
15988     }
15989     if (r->intflags & PREGf_ANCH) {
15990         PerlIO_printf(Perl_debug_log, "anchored");
15991         if (r->intflags & PREGf_ANCH_MBOL)
15992             PerlIO_printf(Perl_debug_log, "(MBOL)");
15993         if (r->intflags & PREGf_ANCH_SBOL)
15994             PerlIO_printf(Perl_debug_log, "(SBOL)");
15995         if (r->intflags & PREGf_ANCH_GPOS)
15996             PerlIO_printf(Perl_debug_log, "(GPOS)");
15997         PerlIO_putc(Perl_debug_log, ' ');
15998     }
15999     if (r->intflags & PREGf_GPOS_SEEN)
16000         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
16001     if (r->intflags & PREGf_SKIP)
16002         PerlIO_printf(Perl_debug_log, "plus ");
16003     if (r->intflags & PREGf_IMPLICIT)
16004         PerlIO_printf(Perl_debug_log, "implicit ");
16005     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
16006     if (r->extflags & RXf_EVAL_SEEN)
16007         PerlIO_printf(Perl_debug_log, "with eval ");
16008     PerlIO_printf(Perl_debug_log, "\n");
16009     DEBUG_FLAGS_r({
16010         regdump_extflags("r->extflags: ",r->extflags);
16011         regdump_intflags("r->intflags: ",r->intflags);
16012     });
16013 #else
16014     PERL_ARGS_ASSERT_REGDUMP;
16015     PERL_UNUSED_CONTEXT;
16016     PERL_UNUSED_ARG(r);
16017 #endif  /* DEBUGGING */
16018 }
16019
16020 /*
16021 - regprop - printable representation of opcode, with run time support
16022 */
16023
16024 void
16025 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo)
16026 {
16027 #ifdef DEBUGGING
16028     int k;
16029
16030     /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
16031     static const char * const anyofs[] = {
16032 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
16033     || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
16034     || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
16035     || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
16036     || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
16037     || _CC_VERTSPACE != 16
16038   #error Need to adjust order of anyofs[]
16039 #endif
16040         "\\w",
16041         "\\W",
16042         "\\d",
16043         "\\D",
16044         "[:alpha:]",
16045         "[:^alpha:]",
16046         "[:lower:]",
16047         "[:^lower:]",
16048         "[:upper:]",
16049         "[:^upper:]",
16050         "[:punct:]",
16051         "[:^punct:]",
16052         "[:print:]",
16053         "[:^print:]",
16054         "[:alnum:]",
16055         "[:^alnum:]",
16056         "[:graph:]",
16057         "[:^graph:]",
16058         "[:cased:]",
16059         "[:^cased:]",
16060         "\\s",
16061         "\\S",
16062         "[:blank:]",
16063         "[:^blank:]",
16064         "[:xdigit:]",
16065         "[:^xdigit:]",
16066         "[:space:]",
16067         "[:^space:]",
16068         "[:cntrl:]",
16069         "[:^cntrl:]",
16070         "[:ascii:]",
16071         "[:^ascii:]",
16072         "\\v",
16073         "\\V"
16074     };
16075     RXi_GET_DECL(prog,progi);
16076     GET_RE_DEBUG_FLAGS_DECL;
16077
16078     PERL_ARGS_ASSERT_REGPROP;
16079
16080     sv_setpvs(sv, "");
16081
16082     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
16083         /* It would be nice to FAIL() here, but this may be called from
16084            regexec.c, and it would be hard to supply pRExC_state. */
16085         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
16086                                               (int)OP(o), (int)REGNODE_MAX);
16087     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
16088
16089     k = PL_regkind[OP(o)];
16090
16091     if (k == EXACT) {
16092         sv_catpvs(sv, " ");
16093         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
16094          * is a crude hack but it may be the best for now since
16095          * we have no flag "this EXACTish node was UTF-8"
16096          * --jhi */
16097         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
16098                   PERL_PV_ESCAPE_UNI_DETECT |
16099                   PERL_PV_ESCAPE_NONASCII   |
16100                   PERL_PV_PRETTY_ELLIPSES   |
16101                   PERL_PV_PRETTY_LTGT       |
16102                   PERL_PV_PRETTY_NOCLEAR
16103                   );
16104     } else if (k == TRIE) {
16105         /* print the details of the trie in dumpuntil instead, as
16106          * progi->data isn't available here */
16107         const char op = OP(o);
16108         const U32 n = ARG(o);
16109         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
16110                (reg_ac_data *)progi->data->data[n] :
16111                NULL;
16112         const reg_trie_data * const trie
16113             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
16114
16115         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
16116         DEBUG_TRIE_COMPILE_r(
16117           Perl_sv_catpvf(aTHX_ sv,
16118             "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
16119             (UV)trie->startstate,
16120             (IV)trie->statecount-1, /* -1 because of the unused 0 element */
16121             (UV)trie->wordcount,
16122             (UV)trie->minlen,
16123             (UV)trie->maxlen,
16124             (UV)TRIE_CHARCOUNT(trie),
16125             (UV)trie->uniquecharcount
16126           );
16127         );
16128         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
16129             sv_catpvs(sv, "[");
16130             (void) put_charclass_bitmap_innards(sv,
16131                                                 (IS_ANYOF_TRIE(op))
16132                                                  ? ANYOF_BITMAP(o)
16133                                                  : TRIE_BITMAP(trie),
16134                                                 NULL);
16135             sv_catpvs(sv, "]");
16136         }
16137
16138     } else if (k == CURLY) {
16139         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
16140             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
16141         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
16142     }
16143     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
16144         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
16145     else if (k == REF || k == OPEN || k == CLOSE
16146              || k == GROUPP || OP(o)==ACCEPT)
16147     {
16148         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
16149         if ( RXp_PAREN_NAMES(prog) ) {
16150             if ( k != REF || (OP(o) < NREF)) {
16151                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
16152                 SV **name= av_fetch(list, ARG(o), 0 );
16153                 if (name)
16154                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16155             }
16156             else {
16157                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
16158                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
16159                 I32 *nums=(I32*)SvPVX(sv_dat);
16160                 SV **name= av_fetch(list, nums[0], 0 );
16161                 I32 n;
16162                 if (name) {
16163                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
16164                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
16165                                     (n ? "," : ""), (IV)nums[n]);
16166                     }
16167                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16168                 }
16169             }
16170         }
16171         if ( k == REF && reginfo) {
16172             U32 n = ARG(o);  /* which paren pair */
16173             I32 ln = prog->offs[n].start;
16174             if (prog->lastparen < n || ln == -1)
16175                 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
16176             else if (ln == prog->offs[n].end)
16177                 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
16178             else {
16179                 const char *s = reginfo->strbeg + ln;
16180                 Perl_sv_catpvf(aTHX_ sv, ": ");
16181                 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
16182                     PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
16183             }
16184         }
16185     } else if (k == GOSUB)
16186         /* Paren and offset */
16187         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
16188     else if (k == VERB) {
16189         if (!o->flags)
16190             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
16191                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
16192     } else if (k == LOGICAL)
16193         /* 2: embedded, otherwise 1 */
16194         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
16195     else if (k == ANYOF) {
16196         const U8 flags = ANYOF_FLAGS(o);
16197         int do_sep = 0;
16198         SV* bitmap_invlist;  /* Will hold what the bit map contains */
16199
16200
16201         if (flags & ANYOF_LOCALE_FLAGS)
16202             sv_catpvs(sv, "{loc}");
16203         if (flags & ANYOF_LOC_FOLD)
16204             sv_catpvs(sv, "{i}");
16205         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
16206         if (flags & ANYOF_INVERT)
16207             sv_catpvs(sv, "^");
16208
16209         /* output what the standard cp 0-NUM_ANYOF_CODE_POINTS-1 bitmap matches
16210          * */
16211         do_sep = put_charclass_bitmap_innards(sv, ANYOF_BITMAP(o),
16212                                                             &bitmap_invlist);
16213
16214         /* output any special charclass tests (used entirely under use
16215          * locale) * */
16216         if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
16217             int i;
16218             for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
16219                 if (ANYOF_POSIXL_TEST(o,i)) {
16220                     sv_catpv(sv, anyofs[i]);
16221                     do_sep = 1;
16222                 }
16223             }
16224         }
16225
16226         if ((flags & (ANYOF_MATCHES_ALL_ABOVE_BITMAP
16227                       |ANYOF_HAS_UTF8_NONBITMAP_MATCHES
16228                       |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES
16229                       |ANYOF_LOC_FOLD)))
16230         {
16231             if (do_sep) {
16232                 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
16233                 if (flags & ANYOF_INVERT)
16234                     /*make sure the invert info is in each */
16235                     sv_catpvs(sv, "^");
16236             }
16237
16238             if (flags & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
16239                 sv_catpvs(sv, "{non-utf8-latin1-all}");
16240             }
16241
16242             /* output information about the unicode matching */
16243             if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP)
16244                 sv_catpvs(sv, "{above_bitmap_all}");
16245             else if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
16246                 SV *lv; /* Set if there is something outside the bit map. */
16247                 bool byte_output = FALSE;   /* If something in the bitmap has
16248                                                been output */
16249                 SV *only_utf8_locale;
16250
16251                 /* Get the stuff that wasn't in the bitmap.  'bitmap_invlist'
16252                  * is used to guarantee that nothing in the bitmap gets
16253                  * returned */
16254                 (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
16255                                                     &lv, &only_utf8_locale,
16256                                                     bitmap_invlist);
16257                 if (lv && lv != &PL_sv_undef) {
16258                     char *s = savesvpv(lv);
16259                     char * const origs = s;
16260
16261                     while (*s && *s != '\n')
16262                         s++;
16263
16264                     if (*s == '\n') {
16265                         const char * const t = ++s;
16266
16267                         if (flags & ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES) {
16268                             sv_catpvs(sv, "{outside bitmap}");
16269                         }
16270                         else {
16271                             sv_catpvs(sv, "{utf8}");
16272                         }
16273
16274                         if (byte_output) {
16275                             sv_catpvs(sv, " ");
16276                         }
16277
16278                         while (*s) {
16279                             if (*s == '\n') {
16280
16281                                 /* Truncate very long output */
16282                                 if (s - origs > 256) {
16283                                     Perl_sv_catpvf(aTHX_ sv,
16284                                                 "%.*s...",
16285                                                 (int) (s - origs - 1),
16286                                                 t);
16287                                     goto out_dump;
16288                                 }
16289                                 *s = ' ';
16290                             }
16291                             else if (*s == '\t') {
16292                                 *s = '-';
16293                             }
16294                             s++;
16295                         }
16296                         if (s[-1] == ' ')
16297                             s[-1] = 0;
16298
16299                         sv_catpv(sv, t);
16300                     }
16301
16302                 out_dump:
16303
16304                     Safefree(origs);
16305                     SvREFCNT_dec_NN(lv);
16306                 }
16307
16308                 if ((flags & ANYOF_LOC_FOLD)
16309                      && only_utf8_locale
16310                      && only_utf8_locale != &PL_sv_undef)
16311                 {
16312                     UV start, end;
16313                     int max_entries = 256;
16314
16315                     sv_catpvs(sv, "{utf8 locale}");
16316                     invlist_iterinit(only_utf8_locale);
16317                     while (invlist_iternext(only_utf8_locale,
16318                                             &start, &end)) {
16319                         put_range(sv, start, end, FALSE);
16320                         max_entries --;
16321                         if (max_entries < 0) {
16322                             sv_catpvs(sv, "...");
16323                             break;
16324                         }
16325                     }
16326                     invlist_iterfinish(only_utf8_locale);
16327                 }
16328             }
16329         }
16330         SvREFCNT_dec(bitmap_invlist);
16331
16332
16333         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
16334     }
16335     else if (k == POSIXD || k == NPOSIXD) {
16336         U8 index = FLAGS(o) * 2;
16337         if (index < C_ARRAY_LENGTH(anyofs)) {
16338             if (*anyofs[index] != '[')  {
16339                 sv_catpv(sv, "[");
16340             }
16341             sv_catpv(sv, anyofs[index]);
16342             if (*anyofs[index] != '[')  {
16343                 sv_catpv(sv, "]");
16344             }
16345         }
16346         else {
16347             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
16348         }
16349     }
16350     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
16351         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
16352     else if (OP(o) == SBOL)
16353         Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
16354 #else
16355     PERL_UNUSED_CONTEXT;
16356     PERL_UNUSED_ARG(sv);
16357     PERL_UNUSED_ARG(o);
16358     PERL_UNUSED_ARG(prog);
16359     PERL_UNUSED_ARG(reginfo);
16360 #endif  /* DEBUGGING */
16361 }
16362
16363
16364
16365 SV *
16366 Perl_re_intuit_string(pTHX_ REGEXP * const r)
16367 {                               /* Assume that RE_INTUIT is set */
16368     struct regexp *const prog = ReANY(r);
16369     GET_RE_DEBUG_FLAGS_DECL;
16370
16371     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
16372     PERL_UNUSED_CONTEXT;
16373
16374     DEBUG_COMPILE_r(
16375         {
16376             const char * const s = SvPV_nolen_const(prog->check_substr
16377                       ? prog->check_substr : prog->check_utf8);
16378
16379             if (!PL_colorset) reginitcolors();
16380             PerlIO_printf(Perl_debug_log,
16381                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
16382                       PL_colors[4],
16383                       prog->check_substr ? "" : "utf8 ",
16384                       PL_colors[5],PL_colors[0],
16385                       s,
16386                       PL_colors[1],
16387                       (strlen(s) > 60 ? "..." : ""));
16388         } );
16389
16390     return prog->check_substr ? prog->check_substr : prog->check_utf8;
16391 }
16392
16393 /*
16394    pregfree()
16395
16396    handles refcounting and freeing the perl core regexp structure. When
16397    it is necessary to actually free the structure the first thing it
16398    does is call the 'free' method of the regexp_engine associated to
16399    the regexp, allowing the handling of the void *pprivate; member
16400    first. (This routine is not overridable by extensions, which is why
16401    the extensions free is called first.)
16402
16403    See regdupe and regdupe_internal if you change anything here.
16404 */
16405 #ifndef PERL_IN_XSUB_RE
16406 void
16407 Perl_pregfree(pTHX_ REGEXP *r)
16408 {
16409     SvREFCNT_dec(r);
16410 }
16411
16412 void
16413 Perl_pregfree2(pTHX_ REGEXP *rx)
16414 {
16415     struct regexp *const r = ReANY(rx);
16416     GET_RE_DEBUG_FLAGS_DECL;
16417
16418     PERL_ARGS_ASSERT_PREGFREE2;
16419
16420     if (r->mother_re) {
16421         ReREFCNT_dec(r->mother_re);
16422     } else {
16423         CALLREGFREE_PVT(rx); /* free the private data */
16424         SvREFCNT_dec(RXp_PAREN_NAMES(r));
16425         Safefree(r->xpv_len_u.xpvlenu_pv);
16426     }
16427     if (r->substrs) {
16428         SvREFCNT_dec(r->anchored_substr);
16429         SvREFCNT_dec(r->anchored_utf8);
16430         SvREFCNT_dec(r->float_substr);
16431         SvREFCNT_dec(r->float_utf8);
16432         Safefree(r->substrs);
16433     }
16434     RX_MATCH_COPY_FREE(rx);
16435 #ifdef PERL_ANY_COW
16436     SvREFCNT_dec(r->saved_copy);
16437 #endif
16438     Safefree(r->offs);
16439     SvREFCNT_dec(r->qr_anoncv);
16440     rx->sv_u.svu_rx = 0;
16441 }
16442
16443 /*  reg_temp_copy()
16444
16445     This is a hacky workaround to the structural issue of match results
16446     being stored in the regexp structure which is in turn stored in
16447     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
16448     could be PL_curpm in multiple contexts, and could require multiple
16449     result sets being associated with the pattern simultaneously, such
16450     as when doing a recursive match with (??{$qr})
16451
16452     The solution is to make a lightweight copy of the regexp structure
16453     when a qr// is returned from the code executed by (??{$qr}) this
16454     lightweight copy doesn't actually own any of its data except for
16455     the starp/end and the actual regexp structure itself.
16456
16457 */
16458
16459
16460 REGEXP *
16461 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
16462 {
16463     struct regexp *ret;
16464     struct regexp *const r = ReANY(rx);
16465     const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
16466
16467     PERL_ARGS_ASSERT_REG_TEMP_COPY;
16468
16469     if (!ret_x)
16470         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
16471     else {
16472         SvOK_off((SV *)ret_x);
16473         if (islv) {
16474             /* For PVLVs, SvANY points to the xpvlv body while sv_u points
16475                to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
16476                made both spots point to the same regexp body.) */
16477             REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
16478             assert(!SvPVX(ret_x));
16479             ret_x->sv_u.svu_rx = temp->sv_any;
16480             temp->sv_any = NULL;
16481             SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
16482             SvREFCNT_dec_NN(temp);
16483             /* SvCUR still resides in the xpvlv struct, so the regexp copy-
16484                ing below will not set it. */
16485             SvCUR_set(ret_x, SvCUR(rx));
16486         }
16487     }
16488     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
16489        sv_force_normal(sv) is called.  */
16490     SvFAKE_on(ret_x);
16491     ret = ReANY(ret_x);
16492
16493     SvFLAGS(ret_x) |= SvUTF8(rx);
16494     /* We share the same string buffer as the original regexp, on which we
16495        hold a reference count, incremented when mother_re is set below.
16496        The string pointer is copied here, being part of the regexp struct.
16497      */
16498     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
16499            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
16500     if (r->offs) {
16501         const I32 npar = r->nparens+1;
16502         Newx(ret->offs, npar, regexp_paren_pair);
16503         Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16504     }
16505     if (r->substrs) {
16506         Newx(ret->substrs, 1, struct reg_substr_data);
16507         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16508
16509         SvREFCNT_inc_void(ret->anchored_substr);
16510         SvREFCNT_inc_void(ret->anchored_utf8);
16511         SvREFCNT_inc_void(ret->float_substr);
16512         SvREFCNT_inc_void(ret->float_utf8);
16513
16514         /* check_substr and check_utf8, if non-NULL, point to either their
16515            anchored or float namesakes, and don't hold a second reference.  */
16516     }
16517     RX_MATCH_COPIED_off(ret_x);
16518 #ifdef PERL_ANY_COW
16519     ret->saved_copy = NULL;
16520 #endif
16521     ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
16522     SvREFCNT_inc_void(ret->qr_anoncv);
16523
16524     return ret_x;
16525 }
16526 #endif
16527
16528 /* regfree_internal()
16529
16530    Free the private data in a regexp. This is overloadable by
16531    extensions. Perl takes care of the regexp structure in pregfree(),
16532    this covers the *pprivate pointer which technically perl doesn't
16533    know about, however of course we have to handle the
16534    regexp_internal structure when no extension is in use.
16535
16536    Note this is called before freeing anything in the regexp
16537    structure.
16538  */
16539
16540 void
16541 Perl_regfree_internal(pTHX_ REGEXP * const rx)
16542 {
16543     struct regexp *const r = ReANY(rx);
16544     RXi_GET_DECL(r,ri);
16545     GET_RE_DEBUG_FLAGS_DECL;
16546
16547     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
16548
16549     DEBUG_COMPILE_r({
16550         if (!PL_colorset)
16551             reginitcolors();
16552         {
16553             SV *dsv= sv_newmortal();
16554             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
16555                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
16556             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
16557                 PL_colors[4],PL_colors[5],s);
16558         }
16559     });
16560 #ifdef RE_TRACK_PATTERN_OFFSETS
16561     if (ri->u.offsets)
16562         Safefree(ri->u.offsets);             /* 20010421 MJD */
16563 #endif
16564     if (ri->code_blocks) {
16565         int n;
16566         for (n = 0; n < ri->num_code_blocks; n++)
16567             SvREFCNT_dec(ri->code_blocks[n].src_regex);
16568         Safefree(ri->code_blocks);
16569     }
16570
16571     if (ri->data) {
16572         int n = ri->data->count;
16573
16574         while (--n >= 0) {
16575           /* If you add a ->what type here, update the comment in regcomp.h */
16576             switch (ri->data->what[n]) {
16577             case 'a':
16578             case 'r':
16579             case 's':
16580             case 'S':
16581             case 'u':
16582                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
16583                 break;
16584             case 'f':
16585                 Safefree(ri->data->data[n]);
16586                 break;
16587             case 'l':
16588             case 'L':
16589                 break;
16590             case 'T':
16591                 { /* Aho Corasick add-on structure for a trie node.
16592                      Used in stclass optimization only */
16593                     U32 refcount;
16594                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
16595 #ifdef USE_ITHREADS
16596                     dVAR;
16597 #endif
16598                     OP_REFCNT_LOCK;
16599                     refcount = --aho->refcount;
16600                     OP_REFCNT_UNLOCK;
16601                     if ( !refcount ) {
16602                         PerlMemShared_free(aho->states);
16603                         PerlMemShared_free(aho->fail);
16604                          /* do this last!!!! */
16605                         PerlMemShared_free(ri->data->data[n]);
16606                         /* we should only ever get called once, so
16607                          * assert as much, and also guard the free
16608                          * which /might/ happen twice. At the least
16609                          * it will make code anlyzers happy and it
16610                          * doesn't cost much. - Yves */
16611                         assert(ri->regstclass);
16612                         if (ri->regstclass) {
16613                             PerlMemShared_free(ri->regstclass);
16614                             ri->regstclass = 0;
16615                         }
16616                     }
16617                 }
16618                 break;
16619             case 't':
16620                 {
16621                     /* trie structure. */
16622                     U32 refcount;
16623                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
16624 #ifdef USE_ITHREADS
16625                     dVAR;
16626 #endif
16627                     OP_REFCNT_LOCK;
16628                     refcount = --trie->refcount;
16629                     OP_REFCNT_UNLOCK;
16630                     if ( !refcount ) {
16631                         PerlMemShared_free(trie->charmap);
16632                         PerlMemShared_free(trie->states);
16633                         PerlMemShared_free(trie->trans);
16634                         if (trie->bitmap)
16635                             PerlMemShared_free(trie->bitmap);
16636                         if (trie->jump)
16637                             PerlMemShared_free(trie->jump);
16638                         PerlMemShared_free(trie->wordinfo);
16639                         /* do this last!!!! */
16640                         PerlMemShared_free(ri->data->data[n]);
16641                     }
16642                 }
16643                 break;
16644             default:
16645                 Perl_croak(aTHX_ "panic: regfree data code '%c'",
16646                                                     ri->data->what[n]);
16647             }
16648         }
16649         Safefree(ri->data->what);
16650         Safefree(ri->data);
16651     }
16652
16653     Safefree(ri);
16654 }
16655
16656 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
16657 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
16658 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
16659
16660 /*
16661    re_dup - duplicate a regexp.
16662
16663    This routine is expected to clone a given regexp structure. It is only
16664    compiled under USE_ITHREADS.
16665
16666    After all of the core data stored in struct regexp is duplicated
16667    the regexp_engine.dupe method is used to copy any private data
16668    stored in the *pprivate pointer. This allows extensions to handle
16669    any duplication it needs to do.
16670
16671    See pregfree() and regfree_internal() if you change anything here.
16672 */
16673 #if defined(USE_ITHREADS)
16674 #ifndef PERL_IN_XSUB_RE
16675 void
16676 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
16677 {
16678     dVAR;
16679     I32 npar;
16680     const struct regexp *r = ReANY(sstr);
16681     struct regexp *ret = ReANY(dstr);
16682
16683     PERL_ARGS_ASSERT_RE_DUP_GUTS;
16684
16685     npar = r->nparens+1;
16686     Newx(ret->offs, npar, regexp_paren_pair);
16687     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16688
16689     if (ret->substrs) {
16690         /* Do it this way to avoid reading from *r after the StructCopy().
16691            That way, if any of the sv_dup_inc()s dislodge *r from the L1
16692            cache, it doesn't matter.  */
16693         const bool anchored = r->check_substr
16694             ? r->check_substr == r->anchored_substr
16695             : r->check_utf8 == r->anchored_utf8;
16696         Newx(ret->substrs, 1, struct reg_substr_data);
16697         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16698
16699         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
16700         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
16701         ret->float_substr = sv_dup_inc(ret->float_substr, param);
16702         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
16703
16704         /* check_substr and check_utf8, if non-NULL, point to either their
16705            anchored or float namesakes, and don't hold a second reference.  */
16706
16707         if (ret->check_substr) {
16708             if (anchored) {
16709                 assert(r->check_utf8 == r->anchored_utf8);
16710                 ret->check_substr = ret->anchored_substr;
16711                 ret->check_utf8 = ret->anchored_utf8;
16712             } else {
16713                 assert(r->check_substr == r->float_substr);
16714                 assert(r->check_utf8 == r->float_utf8);
16715                 ret->check_substr = ret->float_substr;
16716                 ret->check_utf8 = ret->float_utf8;
16717             }
16718         } else if (ret->check_utf8) {
16719             if (anchored) {
16720                 ret->check_utf8 = ret->anchored_utf8;
16721             } else {
16722                 ret->check_utf8 = ret->float_utf8;
16723             }
16724         }
16725     }
16726
16727     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
16728     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
16729
16730     if (ret->pprivate)
16731         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
16732
16733     if (RX_MATCH_COPIED(dstr))
16734         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
16735     else
16736         ret->subbeg = NULL;
16737 #ifdef PERL_ANY_COW
16738     ret->saved_copy = NULL;
16739 #endif
16740
16741     /* Whether mother_re be set or no, we need to copy the string.  We
16742        cannot refrain from copying it when the storage points directly to
16743        our mother regexp, because that's
16744                1: a buffer in a different thread
16745                2: something we no longer hold a reference on
16746                so we need to copy it locally.  */
16747     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
16748     ret->mother_re   = NULL;
16749 }
16750 #endif /* PERL_IN_XSUB_RE */
16751
16752 /*
16753    regdupe_internal()
16754
16755    This is the internal complement to regdupe() which is used to copy
16756    the structure pointed to by the *pprivate pointer in the regexp.
16757    This is the core version of the extension overridable cloning hook.
16758    The regexp structure being duplicated will be copied by perl prior
16759    to this and will be provided as the regexp *r argument, however
16760    with the /old/ structures pprivate pointer value. Thus this routine
16761    may override any copying normally done by perl.
16762
16763    It returns a pointer to the new regexp_internal structure.
16764 */
16765
16766 void *
16767 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
16768 {
16769     dVAR;
16770     struct regexp *const r = ReANY(rx);
16771     regexp_internal *reti;
16772     int len;
16773     RXi_GET_DECL(r,ri);
16774
16775     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
16776
16777     len = ProgLen(ri);
16778
16779     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
16780           char, regexp_internal);
16781     Copy(ri->program, reti->program, len+1, regnode);
16782
16783     reti->num_code_blocks = ri->num_code_blocks;
16784     if (ri->code_blocks) {
16785         int n;
16786         Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
16787                 struct reg_code_block);
16788         Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
16789                 struct reg_code_block);
16790         for (n = 0; n < ri->num_code_blocks; n++)
16791              reti->code_blocks[n].src_regex = (REGEXP*)
16792                     sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
16793     }
16794     else
16795         reti->code_blocks = NULL;
16796
16797     reti->regstclass = NULL;
16798
16799     if (ri->data) {
16800         struct reg_data *d;
16801         const int count = ri->data->count;
16802         int i;
16803
16804         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
16805                 char, struct reg_data);
16806         Newx(d->what, count, U8);
16807
16808         d->count = count;
16809         for (i = 0; i < count; i++) {
16810             d->what[i] = ri->data->what[i];
16811             switch (d->what[i]) {
16812                 /* see also regcomp.h and regfree_internal() */
16813             case 'a': /* actually an AV, but the dup function is identical.  */
16814             case 'r':
16815             case 's':
16816             case 'S':
16817             case 'u': /* actually an HV, but the dup function is identical.  */
16818                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
16819                 break;
16820             case 'f':
16821                 /* This is cheating. */
16822                 Newx(d->data[i], 1, regnode_ssc);
16823                 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
16824                 reti->regstclass = (regnode*)d->data[i];
16825                 break;
16826             case 'T':
16827                 /* Trie stclasses are readonly and can thus be shared
16828                  * without duplication. We free the stclass in pregfree
16829                  * when the corresponding reg_ac_data struct is freed.
16830                  */
16831                 reti->regstclass= ri->regstclass;
16832                 /* FALLTHROUGH */
16833             case 't':
16834                 OP_REFCNT_LOCK;
16835                 ((reg_trie_data*)ri->data->data[i])->refcount++;
16836                 OP_REFCNT_UNLOCK;
16837                 /* FALLTHROUGH */
16838             case 'l':
16839             case 'L':
16840                 d->data[i] = ri->data->data[i];
16841                 break;
16842             default:
16843                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
16844                                                            ri->data->what[i]);
16845             }
16846         }
16847
16848         reti->data = d;
16849     }
16850     else
16851         reti->data = NULL;
16852
16853     reti->name_list_idx = ri->name_list_idx;
16854
16855 #ifdef RE_TRACK_PATTERN_OFFSETS
16856     if (ri->u.offsets) {
16857         Newx(reti->u.offsets, 2*len+1, U32);
16858         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
16859     }
16860 #else
16861     SetProgLen(reti,len);
16862 #endif
16863
16864     return (void*)reti;
16865 }
16866
16867 #endif    /* USE_ITHREADS */
16868
16869 #ifndef PERL_IN_XSUB_RE
16870
16871 /*
16872  - regnext - dig the "next" pointer out of a node
16873  */
16874 regnode *
16875 Perl_regnext(pTHX_ regnode *p)
16876 {
16877     I32 offset;
16878
16879     if (!p)
16880         return(NULL);
16881
16882     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
16883         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
16884                                                 (int)OP(p), (int)REGNODE_MAX);
16885     }
16886
16887     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
16888     if (offset == 0)
16889         return(NULL);
16890
16891     return(p+offset);
16892 }
16893 #endif
16894
16895 STATIC void
16896 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
16897 {
16898     va_list args;
16899     STRLEN l1 = strlen(pat1);
16900     STRLEN l2 = strlen(pat2);
16901     char buf[512];
16902     SV *msv;
16903     const char *message;
16904
16905     PERL_ARGS_ASSERT_RE_CROAK2;
16906
16907     if (l1 > 510)
16908         l1 = 510;
16909     if (l1 + l2 > 510)
16910         l2 = 510 - l1;
16911     Copy(pat1, buf, l1 , char);
16912     Copy(pat2, buf + l1, l2 , char);
16913     buf[l1 + l2] = '\n';
16914     buf[l1 + l2 + 1] = '\0';
16915     va_start(args, pat2);
16916     msv = vmess(buf, &args);
16917     va_end(args);
16918     message = SvPV_const(msv,l1);
16919     if (l1 > 512)
16920         l1 = 512;
16921     Copy(message, buf, l1 , char);
16922     /* l1-1 to avoid \n */
16923     Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
16924 }
16925
16926 #ifdef DEBUGGING
16927 /* Certain characters are output as a sequence with the first being a
16928  * backslash. */
16929 #define isBACKSLASHED_PUNCT(c)                                              \
16930                     ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
16931
16932 STATIC void
16933 S_put_code_point(pTHX_ SV *sv, UV c)
16934 {
16935     PERL_ARGS_ASSERT_PUT_CODE_POINT;
16936
16937     if (c > 255) {
16938         Perl_sv_catpvf(aTHX_ sv, "\\x{%04"UVXf"}", c);
16939     }
16940     else if (isPRINT(c)) {
16941         const char string = (char) c;
16942         if (isBACKSLASHED_PUNCT(c))
16943             sv_catpvs(sv, "\\");
16944         sv_catpvn(sv, &string, 1);
16945     }
16946     else {
16947         const char * const mnemonic = cntrl_to_mnemonic((char) c);
16948         if (mnemonic) {
16949             Perl_sv_catpvf(aTHX_ sv, "%s", mnemonic);
16950         }
16951         else {
16952             Perl_sv_catpvf(aTHX_ sv, "\\x{%02X}", (U8) c);
16953         }
16954     }
16955 }
16956
16957 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
16958
16959 STATIC void
16960 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
16961 {
16962     /* Appends to 'sv' a displayable version of the range of code points from
16963      * 'start' to 'end'.  It assumes that only ASCII printables are displayable
16964      * as-is (though some of these will be escaped by put_code_point()). */
16965
16966     const unsigned int min_range_count = 3;
16967
16968     assert(start <= end);
16969
16970     PERL_ARGS_ASSERT_PUT_RANGE;
16971
16972     while (start <= end) {
16973         UV this_end;
16974         const char * format;
16975
16976         if (end - start < min_range_count) {
16977
16978             /* Individual chars in short ranges */
16979             for (; start <= end; start++) {
16980                 put_code_point(sv, start);
16981             }
16982             break;
16983         }
16984
16985         /* If permitted by the input options, and there is a possibility that
16986          * this range contains a printable literal, look to see if there is
16987          * one.  */
16988         if (allow_literals && start <= MAX_PRINT_A) {
16989
16990             /* If the range begin isn't an ASCII printable, effectively split
16991              * the range into two parts:
16992              *  1) the portion before the first such printable,
16993              *  2) the rest
16994              * and output them separately. */
16995             if (! isPRINT_A(start)) {
16996                 UV temp_end = start + 1;
16997
16998                 /* There is no point looking beyond the final possible
16999                  * printable, in MAX_PRINT_A */
17000                 UV max = MIN(end, MAX_PRINT_A);
17001
17002                 while (temp_end <= max && ! isPRINT_A(temp_end)) {
17003                     temp_end++;
17004                 }
17005
17006                 /* Here, temp_end points to one beyond the first printable if
17007                  * found, or to one beyond 'max' if not.  If none found, make
17008                  * sure that we use the entire range */
17009                 if (temp_end > MAX_PRINT_A) {
17010                     temp_end = end + 1;
17011                 }
17012
17013                 /* Output the first part of the split range, the part that
17014                  * doesn't have printables, with no looking for literals
17015                  * (otherwise we would infinitely recurse) */
17016                 put_range(sv, start, temp_end - 1, FALSE);
17017
17018                 /* The 2nd part of the range (if any) starts here. */
17019                 start = temp_end;
17020
17021                 /* We continue instead of dropping down because even if the 2nd
17022                  * part is non-empty, it could be so short that we want to
17023                  * output it specially, as tested for at the top of this loop.
17024                  * */
17025                 continue;
17026             }
17027
17028             /* Here, 'start' is a printable ASCII.  If it is an alphanumeric,
17029              * output a sub-range of just the digits or letters, then process
17030              * the remaining portion as usual. */
17031             if (isALPHANUMERIC_A(start)) {
17032                 UV mask = (isDIGIT_A(start))
17033                            ? _CC_DIGIT
17034                              : isUPPER_A(start)
17035                                ? _CC_UPPER
17036                                : _CC_LOWER;
17037                 UV temp_end = start + 1;
17038
17039                 /* Find the end of the sub-range that includes just the
17040                  * characters in the same class as the first character in it */
17041                 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
17042                     temp_end++;
17043                 }
17044                 temp_end--;
17045
17046                 /* For short ranges, don't duplicate the code above to output
17047                  * them; just call recursively */
17048                 if (temp_end - start < min_range_count) {
17049                     put_range(sv, start, temp_end, FALSE);
17050                 }
17051                 else {  /* Output as a range */
17052                     put_code_point(sv, start);
17053                     sv_catpvs(sv, "-");
17054                     put_code_point(sv, temp_end);
17055                 }
17056                 start = temp_end + 1;
17057                 continue;
17058             }
17059
17060             /* We output any other printables as individual characters */
17061             if (isPUNCT_A(start) || isSPACE_A(start)) {
17062                 while (start <= end && (isPUNCT_A(start)
17063                                         || isSPACE_A(start)))
17064                 {
17065                     put_code_point(sv, start);
17066                     start++;
17067                 }
17068                 continue;
17069             }
17070         } /* End of looking for literals */
17071
17072         /* Here is not to output as a literal.  Some control characters have
17073          * mnemonic names.  Split off any of those at the beginning and end of
17074          * the range to print mnemonically.  It isn't possible for many of
17075          * these to be in a row, so this won't overwhelm with output */
17076         while (isMNEMONIC_CNTRL(start) && start <= end) {
17077             put_code_point(sv, start);
17078             start++;
17079         }
17080         if (start < end && isMNEMONIC_CNTRL(end)) {
17081
17082             /* Here, the final character in the range has a mnemonic name.
17083              * Work backwards from the end to find the final non-mnemonic */
17084             UV temp_end = end - 1;
17085             while (isMNEMONIC_CNTRL(temp_end)) {
17086                 temp_end--;
17087             }
17088
17089             /* And separately output the range that doesn't have mnemonics */
17090             put_range(sv, start, temp_end, FALSE);
17091
17092             /* Then output the mnemonic trailing controls */
17093             start = temp_end + 1;
17094             while (start <= end) {
17095                 put_code_point(sv, start);
17096                 start++;
17097             }
17098             break;
17099         }
17100
17101         /* As a final resort, output the range or subrange as hex. */
17102
17103         this_end = (end < NUM_ANYOF_CODE_POINTS)
17104                     ? end
17105                     : NUM_ANYOF_CODE_POINTS - 1;
17106         format = (this_end < 256)
17107                  ? "\\x{%02"UVXf"}-\\x{%02"UVXf"}"
17108                  : "\\x{%04"UVXf"}-\\x{%04"UVXf"}";
17109         Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
17110         break;
17111     }
17112 }
17113
17114 STATIC bool
17115 S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist)
17116 {
17117     /* Appends to 'sv' a displayable version of the innards of the bracketed
17118      * character class whose bitmap is 'bitmap';  Returns 'TRUE' if it actually
17119      * output anything, and bitmap_invlist, if not NULL, will point to an
17120      * inversion list of what is in the bit map */
17121
17122     int i;
17123     UV start, end;
17124     unsigned int punct_count = 0;
17125     SV* invlist = NULL;
17126     SV** invlist_ptr;   /* Temporary, in case bitmap_invlist is NULL */
17127     bool allow_literals = TRUE;
17128
17129     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
17130
17131     invlist_ptr = (bitmap_invlist) ? bitmap_invlist : &invlist;
17132
17133     /* Worst case is exactly every-other code point is in the list */
17134     *invlist_ptr = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
17135
17136     /* Convert the bit map to an inversion list, keeping track of how many
17137      * ASCII puncts are set, including an extra amount for the backslashed
17138      * ones.  */
17139     for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
17140         if (BITMAP_TEST(bitmap, i)) {
17141             *invlist_ptr = add_cp_to_invlist(*invlist_ptr, i);
17142             if (isPUNCT_A(i)) {
17143                 punct_count++;
17144                 if isBACKSLASHED_PUNCT(i) {
17145                     punct_count++;
17146                 }
17147             }
17148         }
17149     }
17150
17151     /* Nothing to output */
17152     if (_invlist_len(*invlist_ptr) == 0) {
17153         SvREFCNT_dec(invlist);
17154         return FALSE;
17155     }
17156
17157     /* Generally, it is more readable if printable characters are output as
17158      * literals, but if a range (nearly) spans all of them, it's best to output
17159      * it as a single range.  This code will use a single range if all but 2
17160      * printables are in it */
17161     invlist_iterinit(*invlist_ptr);
17162     while (invlist_iternext(*invlist_ptr, &start, &end)) {
17163
17164         /* If range starts beyond final printable, it doesn't have any in it */
17165         if (start > MAX_PRINT_A) {
17166             break;
17167         }
17168
17169         /* In both ASCII and EBCDIC, a SPACE is the lowest printable.  To span
17170          * all but two, the range must start and end no later than 2 from
17171          * either end */
17172         if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
17173             if (end > MAX_PRINT_A) {
17174                 end = MAX_PRINT_A;
17175             }
17176             if (start < ' ') {
17177                 start = ' ';
17178             }
17179             if (end - start >= MAX_PRINT_A - ' ' - 2) {
17180                 allow_literals = FALSE;
17181             }
17182             break;
17183         }
17184     }
17185     invlist_iterfinish(*invlist_ptr);
17186
17187     /* The legibility of the output depends mostly on how many punctuation
17188      * characters are output.  There are 32 possible ASCII ones, and some have
17189      * an additional backslash, bringing it to currently 36, so if any more
17190      * than 18 are to be output, we can instead output it as its complement,
17191      * yielding fewer puncts, and making it more legible.  But give some weight
17192      * to the fact that outputting it as a complement is less legible than a
17193      * straight output, so don't complement unless we are somewhat over the 18
17194      * mark */
17195     if (allow_literals && punct_count > 22) {
17196         sv_catpvs(sv, "^");
17197
17198         /* Add everything remaining to the list, so when we invert it just
17199          * below, it will be excluded */
17200         _invlist_union_complement_2nd(*invlist_ptr, PL_InBitmap, invlist_ptr);
17201         _invlist_invert(*invlist_ptr);
17202     }
17203
17204     /* Here we have figured things out.  Output each range */
17205     invlist_iterinit(*invlist_ptr);
17206     while (invlist_iternext(*invlist_ptr, &start, &end)) {
17207         if (start >= NUM_ANYOF_CODE_POINTS) {
17208             break;
17209         }
17210         put_range(sv, start, end, allow_literals);
17211     }
17212     invlist_iterfinish(*invlist_ptr);
17213
17214     return TRUE;
17215 }
17216
17217 #define CLEAR_OPTSTART \
17218     if (optstart) STMT_START {                                               \
17219         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,                       \
17220                               " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
17221         optstart=NULL;                                                       \
17222     } STMT_END
17223
17224 #define DUMPUNTIL(b,e)                                                       \
17225                     CLEAR_OPTSTART;                                          \
17226                     node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
17227
17228 STATIC const regnode *
17229 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
17230             const regnode *last, const regnode *plast,
17231             SV* sv, I32 indent, U32 depth)
17232 {
17233     U8 op = PSEUDO;     /* Arbitrary non-END op. */
17234     const regnode *next;
17235     const regnode *optstart= NULL;
17236
17237     RXi_GET_DECL(r,ri);
17238     GET_RE_DEBUG_FLAGS_DECL;
17239
17240     PERL_ARGS_ASSERT_DUMPUNTIL;
17241
17242 #ifdef DEBUG_DUMPUNTIL
17243     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
17244         last ? last-start : 0,plast ? plast-start : 0);
17245 #endif
17246
17247     if (plast && plast < last)
17248         last= plast;
17249
17250     while (PL_regkind[op] != END && (!last || node < last)) {
17251         assert(node);
17252         /* While that wasn't END last time... */
17253         NODE_ALIGN(node);
17254         op = OP(node);
17255         if (op == CLOSE || op == WHILEM)
17256             indent--;
17257         next = regnext((regnode *)node);
17258
17259         /* Where, what. */
17260         if (OP(node) == OPTIMIZED) {
17261             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
17262                 optstart = node;
17263             else
17264                 goto after_print;
17265         } else
17266             CLEAR_OPTSTART;
17267
17268         regprop(r, sv, node, NULL);
17269         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
17270                       (int)(2*indent + 1), "", SvPVX_const(sv));
17271
17272         if (OP(node) != OPTIMIZED) {
17273             if (next == NULL)           /* Next ptr. */
17274                 PerlIO_printf(Perl_debug_log, " (0)");
17275             else if (PL_regkind[(U8)op] == BRANCH
17276                      && PL_regkind[OP(next)] != BRANCH )
17277                 PerlIO_printf(Perl_debug_log, " (FAIL)");
17278             else
17279                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
17280             (void)PerlIO_putc(Perl_debug_log, '\n');
17281         }
17282
17283       after_print:
17284         if (PL_regkind[(U8)op] == BRANCHJ) {
17285             assert(next);
17286             {
17287                 const regnode *nnode = (OP(next) == LONGJMP
17288                                        ? regnext((regnode *)next)
17289                                        : next);
17290                 if (last && nnode > last)
17291                     nnode = last;
17292                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
17293             }
17294         }
17295         else if (PL_regkind[(U8)op] == BRANCH) {
17296             assert(next);
17297             DUMPUNTIL(NEXTOPER(node), next);
17298         }
17299         else if ( PL_regkind[(U8)op]  == TRIE ) {
17300             const regnode *this_trie = node;
17301             const char op = OP(node);
17302             const U32 n = ARG(node);
17303             const reg_ac_data * const ac = op>=AHOCORASICK ?
17304                (reg_ac_data *)ri->data->data[n] :
17305                NULL;
17306             const reg_trie_data * const trie =
17307                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
17308 #ifdef DEBUGGING
17309             AV *const trie_words
17310                            = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
17311 #endif
17312             const regnode *nextbranch= NULL;
17313             I32 word_idx;
17314             sv_setpvs(sv, "");
17315             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
17316                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
17317
17318                 PerlIO_printf(Perl_debug_log, "%*s%s ",
17319                    (int)(2*(indent+3)), "",
17320                     elem_ptr
17321                     ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
17322                                 SvCUR(*elem_ptr), 60,
17323                                 PL_colors[0], PL_colors[1],
17324                                 (SvUTF8(*elem_ptr)
17325                                  ? PERL_PV_ESCAPE_UNI
17326                                  : 0)
17327                                 | PERL_PV_PRETTY_ELLIPSES
17328                                 | PERL_PV_PRETTY_LTGT
17329                             )
17330                     : "???"
17331                 );
17332                 if (trie->jump) {
17333                     U16 dist= trie->jump[word_idx+1];
17334                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
17335                                (UV)((dist ? this_trie + dist : next) - start));
17336                     if (dist) {
17337                         if (!nextbranch)
17338                             nextbranch= this_trie + trie->jump[0];
17339                         DUMPUNTIL(this_trie + dist, nextbranch);
17340                     }
17341                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
17342                         nextbranch= regnext((regnode *)nextbranch);
17343                 } else {
17344                     PerlIO_printf(Perl_debug_log, "\n");
17345                 }
17346             }
17347             if (last && next > last)
17348                 node= last;
17349             else
17350                 node= next;
17351         }
17352         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
17353             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
17354                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
17355         }
17356         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
17357             assert(next);
17358             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
17359         }
17360         else if ( op == PLUS || op == STAR) {
17361             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
17362         }
17363         else if (PL_regkind[(U8)op] == ANYOF) {
17364             /* arglen 1 + class block */
17365             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_MATCHES_POSIXL)
17366                           ? ANYOF_POSIXL_SKIP
17367                           : ANYOF_SKIP);
17368             node = NEXTOPER(node);
17369         }
17370         else if (PL_regkind[(U8)op] == EXACT) {
17371             /* Literal string, where present. */
17372             node += NODE_SZ_STR(node) - 1;
17373             node = NEXTOPER(node);
17374         }
17375         else {
17376             node = NEXTOPER(node);
17377             node += regarglen[(U8)op];
17378         }
17379         if (op == CURLYX || op == OPEN)
17380             indent++;
17381     }
17382     CLEAR_OPTSTART;
17383 #ifdef DEBUG_DUMPUNTIL
17384     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
17385 #endif
17386     return node;
17387 }
17388
17389 #endif  /* DEBUGGING */
17390
17391 /*
17392  * Local variables:
17393  * c-indentation-style: bsd
17394  * c-basic-offset: 4
17395  * indent-tabs-mode: nil
17396  * End:
17397  *
17398  * ex: set ts=8 sts=4 sw=4 et:
17399  */